Bibliometric Data Deduplication and Cross-Field Analysis¶

This notebook processes bibliometric data from multiple CSV files to:

Objectives¶

  1. Load and merge CSV files from different fields (epidemiology, medicine, virology, plant science, literature, religious studies)
  2. Handle duplicates intelligently by tracking all original sources
  3. Verify coverage - ensure all narrow search results are included in corresponding broad searches
  4. Identify cross-field articles - find articles that appear across multiple research fields

Data Structure¶

Each CSV file follows the naming pattern: {field}_{type}_{date}.csv where:

  • Field: epidemiology, medicine, virology, plant_science, literature, religious_studies
  • Type: broad (comprehensive search) or narrow (specific search)
  • Date: collection date (2025-06-05)

Process Overview¶

  1. Load & Validate: Import all CSV files and check column consistency
  2. Parse Metadata: Extract field and search type from filenames
  3. Track Sources: Add tracking columns for original sources and field metadata
  4. Deduplicate: Identify duplicates using DOI, EID, PubMed ID, Link, ISBN in cascade
  5. Coverage Check: Verify all narrow entries exist in corresponding broad searches
  6. Cross-Field Analysis: Identify articles appearing in multiple research fields

Output DataFrames¶

  • master_df: Complete dataset with all metadata and duplicate flags
  • duplicates_df: Only the duplicate entries with reasons for duplication
  • clean_df: Deduplicated dataset with one entry per unique article (includes all_sources tracking)

In [1]:
import sys
import os
import subprocess
import pandas as pd
from collections import defaultdict
from IPython.display import display

# ==============================================================================
# FUNCTION 1: Load CSV files from a folder (unchanged)
# ==============================================================================
def load_csv_from_folder_and_check_columns(folder_path):
    """
    Loads all .csv files from a folder into pandas DataFrames and checks
    if they all have the same columns.
    Returns a dict of DataFrames, with filenames as keys.
    """
    try:
        import pandas as pd
    except ImportError:
        print("pandas not found. Installing...")
        subprocess.check_call([sys.executable, "-m", "pip", "install", "pandas"])
        import pandas as pd

    dataframes = {}
    if not os.path.isdir(folder_path):
        print(f"Error: The specified path '{folder_path}' is not a valid directory.")
        return dataframes

    print(f"Loading .csv files from: {folder_path}")
    for filename in os.listdir(folder_path):
        if filename.endswith(".csv"):
            file_path = os.path.join(folder_path, filename)
            try:
                df_name = os.path.splitext(filename)[0]
                dataframes[df_name] = pd.read_csv(file_path)
                print(f" - Loaded '{filename}'")
            except Exception as e:
                print(f" - Error loading '{filename}': {e}")

    if not dataframes:
        print("No .csv files were found or loaded from the directory.")
        return dataframes

    # --- Check if all DataFrames have the same columns ---
    if len(dataframes) > 1:
        print("\n--- Checking for consistent column names ---")
        first_df_name = list(dataframes.keys())[0]
        reference_columns = set(dataframes[first_df_name].columns)
        all_columns_match = True
        for df_name, df in dataframes.items():
            if set(df.columns) != reference_columns:
                print(f" - DataFrame '{df_name}' has different columns.")
                all_columns_match = False
                break
        if all_columns_match:
            print("Success: All loaded DataFrames have the same columns.")
        else:
            print("Warning: Not all DataFrames have the same columns.")
    return dataframes

# ==============================================================================
# FUNCTION 2: Parse field and type from filename (multi-word field support)
# ==============================================================================
def parse_field_and_type(filename):
    # Example: 'plant_science_broad_2025-06-05.csv' -> ('plant science', 'broad')
    base = os.path.splitext(filename)[0]
    parts = base.split('_')
    if len(parts) < 3:
        return base, ''
    # Field is everything except the last two parts (type and date)
    field = ' '.join(parts[:-2])
    typ = parts[-2]
    return field, typ

# ==============================================================================
# FAST narrow-in-broad CHECK (before merging)
# ==============================================================================
def check_narrow_in_broad_fast(dataframes_dict):
    """
    For each field, check if all narrow entries are present in broad using sets for fast lookup.
    """
    identifier_columns = ['DOI', 'EID', 'PubMed ID', 'Link', 'ISBN']
    def get_row_key(row):
        for col in identifier_columns:
            if col in row and pd.notna(row[col]):
                return f"{col}:{row[col]}"
        if 'Title' in row and 'Year' in row:
            return f"Title:{row['Title']}|Year:{row['Year']}"
        return str(row.name)

    # Organize by field and type
    field_type_to_df = defaultdict(list)
    for name, df in dataframes_dict.items():
        field, typ = parse_field_and_type(name)
        field_type_to_df[(field, typ)].append(df)

    report = {}
    fields = set(field for field, typ in field_type_to_df.keys())
    for field in fields:
        # Merge all broad and all narrow for this field
        broad_df = pd.concat(field_type_to_df.get((field, 'broad'), []), ignore_index=True) if (field, 'broad') in field_type_to_df else pd.DataFrame()
        narrow_df = pd.concat(field_type_to_df.get((field, 'narrow'), []), ignore_index=True) if (field, 'narrow') in field_type_to_df else pd.DataFrame()
        if broad_df.empty or narrow_df.empty:
            continue
        broad_keys = set(get_row_key(row) for _, row in broad_df.iterrows())
        missing_keys = [get_row_key(row) for _, row in narrow_df.iterrows() if get_row_key(row) not in broad_keys]
        if missing_keys:
            report[field] = missing_keys
            print(f"Field '{field}': {len(missing_keys)} narrow entries NOT found in broad.")
        else:
            print(f"Field '{field}': All narrow entries are present in broad.")
    return report

# ==============================================================================
# FUNCTION 3: Merge, track all sources, and check narrow in broad
# ==============================================================================
def process_bibliometric_data_with_sources(dataframes_dict):
    """
    Merges DataFrames, tracks all original sources for each row, and checks that all narrow entries are in broad for each field.
    Returns master_df (with all sources), duplicates_df, clean_df, and a report dict.
    """
    if not dataframes_dict:
        print("Cannot process data: The input dictionary of DataFrames is empty.")
        return pd.DataFrame(), pd.DataFrame(), pd.DataFrame(), {}

    # --- 1. Add metadata: field, type, and filename to each row ---
    records = []
    for name, df in dataframes_dict.items():
        field, typ = parse_field_and_type(name)
        df = df.copy()
        df['original_source'] = name
        df['field'] = field
        df['broad_or_narrow'] = typ
        records.append(df)
    combined_df = pd.concat(records, ignore_index=True)

    # --- 2. Track all sources for each unique row (by main identifiers) ---
    identifier_columns = ['DOI', 'EID', 'PubMed ID', 'Link', 'ISBN']
    def get_row_key(row):
        for col in identifier_columns:
            if col in row and pd.notna(row[col]):
                return f"{col}:{row[col]}"
        if 'Title' in row and 'Year' in row:
            return f"Title:{row['Title']}|Year:{row['Year']}"
        return str(row.name)

    key_to_sources = defaultdict(set)
    for idx, row in combined_df.iterrows():
        key = get_row_key(row)
        key_to_sources[key].add(row['original_source'])

    # Add a column with all sources for each row
    combined_df['all_sources'] = combined_df.apply(lambda row: list(key_to_sources[get_row_key(row)]), axis=1)

    # --- 3. Identify duplicates (same as before, but keep all sources) ---
    combined_df['is_duplicate'] = False
    combined_df['duplicate_of'] = pd.NA
    combined_df['duplicate_reason'] = pd.NA
    processed_indices = set()
    for column in identifier_columns:
        if column not in combined_df.columns:
            continue
        subset_df = combined_df[combined_df[column].notna() & ~combined_df.index.isin(processed_indices)]
        for identifier_value, group in subset_df.groupby(column):
            if len(group) > 1:
                original_index = group.index[0]
                duplicate_indices = group.index[1:]
                combined_df.loc[duplicate_indices, 'is_duplicate'] = True
                combined_df.loc[duplicate_indices, 'duplicate_of'] = original_index
                reason = f"Same {column}: {identifier_value}"
                combined_df.loc[duplicate_indices, 'duplicate_reason'] = reason
                processed_indices.update(group.index)
    num_duplicates = combined_df['is_duplicate'].sum()
    print(f"Identified {num_duplicates} potential duplicate rows.")

    # --- 4. Check that all narrow entries are in broad for each field (FAST version) ---
    report = check_narrow_in_broad_fast(dataframes_dict)

    # --- 5. Create the final output DataFrames ---
    master_df = combined_df
    duplicates_df = master_df[master_df['is_duplicate']].copy()
    clean_df = master_df[~master_df['is_duplicate']].copy()
    clean_df = clean_df.drop(columns=['is_duplicate', 'duplicate_of', 'duplicate_reason'])
    return master_df, duplicates_df, clean_df, report

# ==============================================================================
# --- MAIN EXECUTION (UPDATED) ---
# ==============================================================================

folder_name = "./Bibliometric data UB"
bibliometric_dataframes = load_csv_from_folder_and_check_columns(folder_name)

if bibliometric_dataframes:
    master_df, duplicates_df, clean_df, report = process_bibliometric_data_with_sources(bibliometric_dataframes)

    print("\n--- Processing Complete ---")
    print(f"Shape of Master DataFrame: {master_df.shape}")
    print(f"Shape of Duplicates DataFrame: {duplicates_df.shape}")
    print(f"Shape of Clean DataFrame: {clean_df.shape}")

    if not duplicates_df.empty:
        print("\n--- Sample of Identified Duplicates (with reason) ---")
        display_cols = ['DOI', 'Title', 'Year', 'duplicate_reason', 'duplicate_of', 'original_source', 'all_sources']
        existing_display_cols = [col for col in display_cols if col in duplicates_df.columns]
        display(duplicates_df[existing_display_cols].head())
    else:
        print("\nNo duplicates were found.")

    print("\n--- Narrow vs Broad Coverage Report ---")
    if report:
        for field, missing in report.items():
            print(f"Field '{field}': {len(missing)} narrow entries NOT found in broad.")
    else:
        print("All narrow entries are present in their corresponding broad datasets.")
Loading .csv files from: ./Bibliometric data UB
 - Loaded 'religious_studies_narrow_2025-06-05.csv'
 - Loaded 'virology_broad_2025-06-05.csv'
 - Loaded 'virology_narrow_2025-06-05.csv'
 - Loaded 'plant_science_broad_2025-06-05.csv'
 - Loaded 'medicine_narrow_2025-06-05.csv'
 - Loaded 'literature_broad_2025-06-05.csv'
 - Loaded 'literature_narrow_2025-06-05.csv'
 - Loaded 'medicine_broad_2025-06-05.csv'
 - Loaded 'epidemiology_narrow_2025-06-05.csv'
 - Loaded 'religious_studies_broad_2025-06-05.csv'
 - Loaded 'plant_science_narrow_2025-06-05.csv'
 - Loaded 'epidemiology_broad_2025-06-05.csv'

--- Checking for consistent column names ---
Success: All loaded DataFrames have the same columns.
Identified 14321 potential duplicate rows.
Field 'virology': All narrow entries are present in broad.
Field 'religious studies': All narrow entries are present in broad.
Field 'epidemiology': All narrow entries are present in broad.
Field 'literature': All narrow entries are present in broad.
Field 'plant science': All narrow entries are present in broad.
Field 'medicine': All narrow entries are present in broad.

--- Processing Complete ---
Shape of Master DataFrame: (28523, 53)
Shape of Duplicates DataFrame: (14321, 53)
Shape of Clean DataFrame: (14202, 50)

--- Sample of Identified Duplicates (with reason) ---
DOI Title Year duplicate_reason duplicate_of original_source all_sources
1576 10.1163/9789004302235_006 Becoming a creature of artful existence: Theol... 2015 Same DOI: 10.1163/9789004302235_006 1407 religious_studies_narrow_2025-06-05 [literature_narrow_2025-06-05, literature_broa...
1789 10.4324/9780203387856 Rosemary Radford Ruether 2013 Same DOI: 10.4324/9780203387856 1716 religious_studies_narrow_2025-06-05 [religious_studies_broad_2025-06-05, religious...
2315 10.1007/s12397-023-09500-2 Jewish Environmentalism in the “Jewish America... 2023 Same DOI: 10.1007/s12397-023-09500-2 396 virology_broad_2025-06-05 [religious_studies_broad_2025-06-05, religious...
2425 10.1558/jsrnc.19664 Religion and Biopolitics in the Time of Corona... 2022 Same DOI: 10.1558/jsrnc.19664 602 virology_broad_2025-06-05 [epidemiology_broad_2025-06-05, religious_stud...
2495 10.5325/jjewiethi.8.1.0025 Beyond Jewish Racial Justice Activism: Can Jew... 2022 Same DOI: 10.5325/jjewiethi.8.1.0025 675 virology_broad_2025-06-05 [epidemiology_broad_2025-06-05, religious_stud...
--- Narrow vs Broad Coverage Report ---
All narrow entries are present in their corresponding broad datasets.
In [2]:
# List articles in clean_df that appear in more than one field (not just narrow/broad of the same field)

# --- CORRECTED LOGIC ---
# The previous logic was flawed because it checked the 'field' column of clean_df,
# but clean_df only contains one row per unique article, so that 'field' is arbitrary.
# The correct way is to parse the fields from the 'all_sources' list for each unique article.

# 1. For each row in the clean dataframe, get the set of unique fields from its sources.
def get_fields_from_sources(sources_list):
    # Use a set to automatically handle uniqueness
    fields = {parse_field_and_type(source)[0] for source in sources_list}
    return list(fields)

# Create a temporary copy to avoid SettingWithCopyWarning
cross_field_df = clean_df.copy()

# Add new columns to store the list of all fields and the count of unique fields
cross_field_df['all_fields'] = cross_field_df['all_sources'].apply(get_fields_from_sources)
cross_field_df['num_fields'] = cross_field_df['all_fields'].apply(len)

# 2. Filter for articles that appear in more than one field.
multi_field_df = cross_field_df[cross_field_df['num_fields'] > 1].copy()

print(f"Found {len(multi_field_df)} articles that appear in more than one field.")

if not multi_field_df.empty:
    # 3. Create a matrix view with checkmarks for each field
    
    # Get all unique fields across all articles
    all_fields = set()
    for fields_list in multi_field_df['all_fields']:
        all_fields.update(fields_list)
    all_fields = sorted(list(all_fields))
    
    # Create the matrix dataframe
    matrix_data = []
    for _, row in multi_field_df.iterrows():
        article_fields = set(row['all_fields'])
        matrix_row = {
            'DOI': row.get('DOI', '')[:20] + '...' if len(str(row.get('DOI', ''))) > 20 else row.get('DOI', ''),
            'Title': row['Title'][:50] + '...' if len(row['Title']) > 50 else row['Title'],
            'Year': row.get('Year', ''),
            'Num_Fields': row['num_fields']
        }
        
        # Add checkmarks for each field
        for field in all_fields:
            matrix_row[field] = '✓' if field in article_fields else ''
        
        matrix_data.append(matrix_row)
    
    # Create the matrix dataframe
    matrix_df = pd.DataFrame(matrix_data)
    
    # Sort by number of fields (descending), then by title
    matrix_df = matrix_df.sort_values(['Num_Fields', 'Title'], ascending=[False, True])
    
    print(f"\nCross-Field Articles Matrix (sorted by number of fields):")
    print(f"Fields found: {', '.join(all_fields)}")
    
    # Display the matrix
    display(matrix_df)
    
    # Summary statistics
    print(f"\nSummary:")
    field_count_summary = matrix_df['Num_Fields'].value_counts().sort_index(ascending=False)
    for num_fields, count in field_count_summary.items():
        print(f"  {count:,} articles appear in {num_fields} fields")
        
else:
    print("\nNo articles appear in multiple fields.")
Found 2512 articles that appear in more than one field.

Cross-Field Articles Matrix (sorted by number of fields):
Fields found: epidemiology, literature, medicine, plant science, religious studies, virology
DOI Title Year Num_Fields epidemiology literature medicine plant science religious studies virology
957 10.1080/13691058.201... 'Transparent sexualities': Sexual openness, HI... 2013 4 ✓ ✓ ✓ ✓
1219 NaN AIDS and American apocalypticism: The cultural... 2004 4 ✓ ✓ ✓ ✓
1124 10.1177/030639680708... AIDS denialism and 'The humanisation of the Af... 2008 4 ✓ ✓ ✓ ✓
613 10.1007/s40615-021-0... Black Nurses Collaborative Approach to Address... 2021 4 ✓ ✓ ✓ ✓
245 10.1007/s40615-023-0... Dual Pandemics: Race-Related COVID Stress and ... 2024 4 ✓ ✓ ✓ ✓
... ... ... ... ... ... ... ... ... ... ...
9 10.1353/jji.2024.a93... “Yiddishists for Palestine”: Diasporism and So... 2024 2 ✓ ✓
1740 10.1016/j.drugpo.202... “You have to make some money before you can do... 2020 2 ✓ ✓
1726 10.1016/j.drugpo.202... “You need to just provide health services:” na... 2020 2 ✓ ✓
401 10.1007/s13178-021-0... “…Everybody Else Is More Privileged. Then It’s... 2022 2 ✓ ✓
1602 10.1136/bmjopen-2021... € Screen and Treat for Anaemia Reduction (STAR... 2021 2 ✓ ✓

2512 rows × 10 columns

Summary:
  16 articles appear in 4 fields
  454 articles appear in 3 fields
  2,042 articles appear in 2 fields
In [3]:
clean_df.to_feather('Clean_df.feather')

Analysis of "Activism" Usage in Bibliometric Data¶

This section provides a statistical analysis of the term "activism" in the deduplicated bibliometric dataset, addressing the specified questions where possible with the available data.

In [4]:
# Inspect available columns in clean_df
print("Available columns in clean_df:")
print(clean_df.columns.tolist())
Available columns in clean_df:
['Authors', 'Author full names', 'Author(s) ID', 'Title', 'Year', 'Source title', 'Volume', 'Issue', 'Art. No.', 'Page start', 'Page end', 'Page count', 'Cited by', 'DOI', 'Link', 'Affiliations', 'Authors with affiliations', 'Abstract', 'Author Keywords', 'Index Keywords', 'Molecular Sequence Numbers', 'Chemicals/CAS', 'Tradenames', 'Manufacturers', 'Funding Details', 'Funding Texts', 'References', 'Correspondence Address', 'Editors', 'Publisher', 'Sponsors', 'Conference name', 'Conference date', 'Conference location', 'Conference code', 'ISSN', 'ISBN', 'CODEN', 'PubMed ID', 'Language of Original Document', 'Abbreviated Source Title', 'Document Type', 'Publication Stage', 'Open Access', 'Source', 'EID', 'original_source', 'field', 'broad_or_narrow', 'all_sources']
In [5]:
# Check what keyword columns we have and their content
print("\nChecking keyword columns:")
for col in ['Author Keywords', 'Index Keywords', 'Keywords']:
    if col in clean_df.columns:
        non_null = clean_df[col].notna().sum()
        print(f"{col}: {non_null} non-null entries ({non_null/len(clean_df)*100:.1f}%)")
        if non_null > 0:
            sample = clean_df[clean_df[col].notna()][col].head(3)
            print(f"  Sample values:")
            for val in sample:
                print(f"    - {str(val)[:100]}")
    else:
        print(f"{col}: Column does not exist")
Checking keyword columns:
Author Keywords: 8939 non-null entries (62.9%)
  Sample values:
    - anti-racism; Catholic higher education; Christian rhetoric; communication pedagogy; racial justice l
    - climate activism; magic; pedagogy; post-human co-existence
    - Islamic missionary movements; Libya; Pan-Arabism; Pan-Islamism; Qaddafi; Qur’an translations; World 
Index Keywords: 3850 non-null entries (27.1%)
  Sample values:
    - Abortion, Induced; Abortion, Legal; Beginning of Human Life; Female; Health Policy; Humans; Personho
    - article; child; climate; climate change; commercial phenomena; eco-anxiety; human; knowledge; male; 
    - Albert Einstein; Buddhisms; Christianities; Christians; Future generations; Interconnectivity; Norwi
Keywords: Column does not exist

Note on Keywords Analysis¶

The dataset contains two separate keyword fields:

  • Author Keywords: Keywords provided by the article authors (62.9% coverage)
  • Index Keywords: Controlled vocabulary keywords assigned by the database (27.1% coverage)

The analysis now checks both keyword fields, so "activism_in_keywords" is True if "activism" appears in either Author Keywords OR Index Keywords.

In [6]:
# Function to check for "activism" in text (case-insensitive)
def contains_activism(text):
    if pd.isna(text):
        return False
    return 'activism' in str(text).lower()

# Add columns indicating presence of "activism" in key fields
clean_df['activism_in_title'] = clean_df['Title'].apply(contains_activism)

if 'Abstract' in clean_df.columns:
    clean_df['activism_in_abstract'] = clean_df['Abstract'].apply(contains_activism)
else:
    clean_df['activism_in_abstract'] = False

# Check both Author Keywords and Index Keywords columns
activism_in_author_kw = False
activism_in_index_kw = False
if 'Author Keywords' in clean_df.columns:
    activism_in_author_kw = clean_df['Author Keywords'].apply(contains_activism)
if 'Index Keywords' in clean_df.columns:
    activism_in_index_kw = clean_df['Index Keywords'].apply(contains_activism)

# Combine: activism is in keywords if it appears in EITHER Author Keywords OR Index Keywords
clean_df['activism_in_keywords'] = activism_in_author_kw | activism_in_index_kw

# Calculate prominence score (number of fields where "activism" appears)
clean_df['activism_prominence'] = clean_df[['activism_in_title', 'activism_in_abstract', 'activism_in_keywords']].sum(axis=1)
In [7]:
# 1. Frequency & Distribution
total_articles = len(clean_df)
activism_articles = clean_df[clean_df['activism_prominence'] > 0]
num_activism_articles = len(activism_articles)

print(f"Total articles: {total_articles}")
print(f"Articles mentioning 'activism': {num_activism_articles} ({num_activism_articles/total_articles*100:.1f}%)")

# Frequency over time
if 'Year' in clean_df.columns:
    yearly_activism = activism_articles.groupby('Year').size().sort_index()
    # Ensure all years are included, even with zero counts
    if not yearly_activism.empty:
        all_years = range(int(yearly_activism.index.min()), int(yearly_activism.index.max()) + 1)
        yearly_activism = yearly_activism.reindex(all_years, fill_value=0)
    print("\nActivisim mentions by year:")
    print(yearly_activism)

# By field
field_activism = activism_articles.groupby('field').size().sort_values(ascending=False)
print("\nActivisim mentions by field:")
print(field_activism)

# By journal (assuming 'Journal' or 'Source title' column)
journal_col = None
for col in ['Journal', 'Source title', 'Source Title']:
    if col in clean_df.columns:
        journal_col = col
        break
if journal_col:
    journal_activism = activism_articles.groupby(journal_col).size().sort_values(ascending=False).head(10)
    print(f"\nTop 10 journals mentioning 'activism' (by {journal_col}):")
    print(journal_activism)

# Visualization of yearly activism mentions
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio

# Configure Plotly to include JS inline for HTML export
pio.renderers.default = 'notebook_connected'
import plotly.offline as pyo
pyo.init_notebook_mode(connected=False)

if 'Year' in clean_df.columns and not yearly_activism.empty:
    # Create bar plot
    fig = px.bar(yearly_activism.reset_index(), x='Year', y=0, 
                 title='Activism Mentions by Year',
                 labels={'0': 'Number of Articles', 'Year': 'Publication Year'},
                 color_discrete_sequence=['skyblue'])
    
    # Update layout for legend (though simple here)
    fig.update_layout(showlegend=True, legend_title_text='Legend')
    
    # Ensure every year is shown on x-axis
    fig.update_xaxes(tickmode='array', tickvals=yearly_activism.index.tolist(), ticktext=yearly_activism.index.tolist())
    
    # Show the plot
    pyo.iplot(fig)
Total articles: 14202
Articles mentioning 'activism': 7477 (52.6%)

Activisim mentions by year:
Year
2004     77
2005    100
2006     88
2007    103
2008    116
2009    153
2010    172
2011    207
2012    203
2013    252
2014    276
2015    250
2016    315
2017    354
2018    370
2019    471
2020    535
2021    686
2022    737
2023    835
2024    868
2025    309
dtype: int64

Activisim mentions by field:
field
medicine             2063
religious studies    1982
literature           1925
epidemiology          743
virology              669
plant science          95
dtype: int64

Top 10 journals mentioning 'activism' (by Source title):
Source title
Religions                                                            104
Global Public Health                                                  40
Feminist Formations                                                   29
Social Science and Medicine                                           29
American Journal of Community Psychology                              28
International Journal of Environmental Research and Public Health     25
Research in Drama Education                                           25
Culture, Health and Sexuality                                         25
Canadian Historical Review                                            23
Sociology of Sport Journal                                            22
dtype: int64
In [19]:
# Shared helper functions for geographical analysis
import numpy as np

def extract_country(affiliation):
    """Extract country from affiliation string (assumes format: ..., Country)"""
    if pd.isna(affiliation):
        return None
    parts = str(affiliation).split(',')
    if parts:
        return parts[-1].strip()
    return None

# Comprehensive map of country names to ISO-3 codes (shared for all geographic analyses)
country_iso_mapping = {
    'United States': 'USA', 'United Kingdom': 'GBR', 'Canada': 'CAN', 'Australia': 'AUS',
    'South Africa': 'ZAF', 'Germany': 'DEU', 'India': 'IND', 'Spain': 'ESP',
    'Netherlands': 'NLD', 'Brazil': 'BRA', 'Israel': 'ISR', 'France': 'FRA',
    'Italy': 'ITA', 'Sweden': 'SWE', 'Indonesia': 'IDN', 'New Zealand': 'NZL',
    'Poland': 'POL', 'Ireland': 'IRL', 'Denmark': 'DNK', 'Hong Kong': 'HKG',
    'Finland': 'FIN', 'South Korea': 'KOR', 'Norway': 'NOR', 'Austria': 'AUT',
    'Nigeria': 'NGA', 'Portugal': 'PRT', 'Switzerland': 'CHE', 'Russian Federation': 'RUS',
    'Belgium': 'BEL', 'China': 'CHN', 'Japan': 'JPN', 'Mexico': 'MEX',
    'Argentina': 'ARG', 'Chile': 'CHL', 'Colombia': 'COL', 'Greece': 'GRC',
    'Turkey': 'TUR', 'Egypt': 'EGY', 'Thailand': 'THA', 'Malaysia': 'MYS',
    'Singapore': 'SGP', 'Philippines': 'PHL', 'Vietnam': 'VNM', 'Pakistan': 'PAK',
    'Bangladesh': 'BGD', 'Kenya': 'KEN', 'Ghana': 'GHA', 'Ethiopia': 'ETH',
    'Tanzania': 'TZA', 'Uganda': 'UGA', 'Morocco': 'MAR', 'Algeria': 'DZA',
    'Tunisia': 'TUN', 'Iran': 'IRN', 'Iraq': 'IRQ', 'Saudi Arabia': 'SAU',
    'United Arab Emirates': 'ARE', 'Jordan': 'JOR', 'Lebanon': 'LBN', 'Palestine': 'PSE',
    'Czech Republic': 'CZE', 'Hungary': 'HUN', 'Romania': 'ROU', 'Bulgaria': 'BGR',
    'Croatia': 'HRV', 'Serbia': 'SRB', 'Slovenia': 'SVN', 'Slovakia': 'SVK',
    'Estonia': 'EST', 'Latvia': 'LVA', 'Lithuania': 'LTU', 'Ukraine': 'UKR',
    'Belarus': 'BLR', 'Kazakhstan': 'KAZ', 'Uzbekistan': 'UZB', 'Georgia': 'GEO',
    'Armenia': 'ARM', 'Azerbaijan': 'AZE', 'Peru': 'PER', 'Venezuela': 'VEN',
    'Ecuador': 'ECU', 'Bolivia': 'BOL', 'Paraguay': 'PRY', 'Uruguay': 'URY',
    'Costa Rica': 'CRI', 'Panama': 'PAN', 'Guatemala': 'GTM', 'Honduras': 'HND',
    'El Salvador': 'SLV', 'Nicaragua': 'NIC', 'Cuba': 'CUB', 'Jamaica': 'JAM',
    'Trinidad and Tobago': 'TTO', 'Taiwan': 'TWN', 'Iceland': 'ISL', 'Luxembourg': 'LUX',
    'Malta': 'MLT', 'Cyprus': 'CYP', 'Zambia': 'ZMB', 'Zimbabwe': 'ZWE',
    'Botswana': 'BWA', 'Namibia': 'NAM', 'Mozambique': 'MOZ', 'Madagascar': 'MDG',
    'Cameroon': 'CMR', 'Senegal': 'SEN', "Côte d'Ivoire": 'CIV', 'Ivory Coast': 'CIV',
    'Mali': 'MLI', 'Burkina Faso': 'BFA', 'Niger': 'NER', 'Chad': 'TCD',
    'Sudan': 'SDN', 'South Sudan': 'SSD', 'Somalia': 'SOM', 'Rwanda': 'RWA',
    'Burundi': 'BDI', 'Malawi': 'MWI', 'Angola': 'AGO', 'Congo': 'COG',
    'Democratic Republic of the Congo': 'COD', 'Gabon': 'GAB', 'Nepal': 'NPL',
    'Sri Lanka': 'LKA', 'Myanmar': 'MMR', 'Cambodia': 'KHM', 'Laos': 'LAO',
    'Afghanistan': 'AFG', 'Mongolia': 'MNG', 'Brunei': 'BRN', 'Macao': 'MAC',
    'Papua New Guinea': 'PNG', 'Fiji': 'FJI', 'Mauritius': 'MUS', 'Réunion': 'REU',
    'French Guiana': 'GUF', 'Martinique': 'MTQ', 'Guadeloupe': 'GLP',
    # Common variants and alternate names
    'Viet Nam': 'VNM', 'Republic of Korea': 'KOR', 'Korea': 'KOR',
    'Bosnia and Herzegovina': 'BIH', 'The Netherlands': 'NLD', 'Republic of Ireland': 'IRL',
    'USA': 'USA', 'UK': 'GBR', 'UAE': 'ARE', 'PRC': 'CHN',
    'England': 'GBR', 'Scotland': 'GBR', 'Wales': 'GBR', 'Northern Ireland': 'GBR',
    'Qatar': 'QAT', 'Kuwait': 'KWT', 'Bahrain': 'BHR', 'Oman': 'OMN',
    'Yemen': 'YEM', 'Syria': 'SYR', 'Libya': 'LBY', 'Barbados': 'BRB',
}
In [9]:
# 2b. Geographical Distribution - Activism Articles
# Now focus on activism-mentioning articles using the same country mapping

activism_articles = activism_articles.copy()  # Avoid SettingWithCopyWarning
activism_articles['Extracted_Country'] = activism_articles['Affiliations'].apply(extract_country)
country_activism_all = activism_articles['Extracted_Country'].value_counts()
country_activism_top30 = country_activism_all.head(30)

print("\nActivism Articles Geographic Distribution:")
print(f"Total activism articles with country information: {country_activism_all.sum():,}")
print(f"\nTop 30 countries (activism articles):")

# Bar chart - Activism articles
fig_activism = px.bar(country_activism_top30.reset_index(), x='Extracted_Country', y='count', 
             title='Top 30 Countries - Activism-Mentioning Articles',
             labels={'Extracted_Country': 'Country', 'count': 'Number of Articles'},
             color_discrete_sequence=['lightgreen'])
fig_activism.update_xaxes(tickangle=45)
pyo.iplot(fig_activism)

# World map - Activism articles
country_activism_df = country_activism_all.reset_index()
country_activism_df.columns = ['Extracted_Country', 'Actual_Count']
country_activism_df['Extracted_Country'] = country_activism_df['Extracted_Country'].str.strip()
country_activism_df['ISO3'] = country_activism_df['Extracted_Country'].map(country_iso_mapping)

unmapped_activism = country_activism_df[country_activism_df['ISO3'].isna()]
country_activism_df = country_activism_df[country_activism_df['ISO3'].notna()]
country_activism_df['count_log'] = np.log10(country_activism_df['Actual_Count'] + 1)

fig_map_activism = px.choropleth(country_activism_df, locations='ISO3', 
                        locationmode='ISO-3',
                        color='count_log',
                        hover_name='Extracted_Country',
                        hover_data={'count_log': ':.2f', 'Actual_Count': True, 'ISO3': False, 'Extracted_Country': False},
                        color_continuous_scale='Reds',
                        title='World Map - Activism-Mentioning Articles by Country',
                        labels={'count_log': 'Log10(Articles)', 'Actual_Count': 'Number of Articles'},
                        projection='natural earth')
fig_map_activism.update_layout(width=1280, height=720, geo=dict(showcountries=True, showcoastlines=True, showframe=False))
pyo.iplot(fig_map_activism)
Activism Articles Geographic Distribution:
Total activism articles with country information: 6,783

Top 30 countries (activism articles):
In [21]:
# 2a. Geographical Distribution - Full Dataset
# Analyze the complete dataset using the shared functions and country mapping

# Extract country for ALL articles
clean_df_with_country = clean_df.copy()
clean_df_with_country['Extracted_Country'] = clean_df_with_country['Affiliations'].apply(extract_country)
country_all_articles = clean_df_with_country['Extracted_Country'].value_counts()
country_all_top30 = country_all_articles.head(30)

print("Full Dataset Geographic Distribution:")
print(f"Total articles with country information: {country_all_articles.sum():,}")
print(f"\nTop 30 countries (all articles):")

# Bar chart - Full dataset
fig_all = px.bar(country_all_top30.reset_index(), x='Extracted_Country', y='count', 
             title='Top 30 Countries - Full Dataset (All Articles)',
             labels={'Extracted_Country': 'Country', 'count': 'Number of Articles'},
             color_discrete_sequence=['steelblue'])
fig_all.update_xaxes(tickangle=45)
pyo.iplot(fig_all)

# World map - Full dataset
country_all_df = country_all_articles.reset_index()
country_all_df.columns = ['Extracted_Country', 'Actual_Count']
country_all_df['Extracted_Country'] = country_all_df['Extracted_Country'].str.strip()
country_all_df['ISO3'] = country_all_df['Extracted_Country'].map(country_iso_mapping)

# Filter out unmapped countries
unmapped_all = country_all_df[country_all_df['ISO3'].isna()]
country_all_df_mapped = country_all_df[country_all_df['ISO3'].notna()].copy().reset_index(drop=True)
country_all_df_mapped['count_log'] = np.log10(country_all_df_mapped['Actual_Count'] + 1)

fig_map_all = px.choropleth(country_all_df_mapped, locations='ISO3', 
                        locationmode='ISO-3',
                        color='count_log',
                        hover_name='Extracted_Country',
                        hover_data={'count_log': ':.2f', 'Actual_Count': True, 'ISO3': False, 'Extracted_Country': False},
                        color_continuous_scale='YlOrRd',
                        title='World Map - Full Dataset (All Articles by Country)',
                        labels={'count_log': 'Log10(Articles)', 'Actual_Count': 'Number of Articles'},
                        projection='natural earth')
fig_map_all.update_layout(width=1280, height=720, geo=dict(showcountries=True, showcoastlines=True, showframe=False))
pyo.iplot(fig_map_all)
Full Dataset Geographic Distribution:
Total articles with country information: 12,815

Top 30 countries (all articles):
In [11]:
# 3. Prominence of Use
prominence_dist = clean_df['activism_prominence'].value_counts().sort_index()

# Combinations
combinations = clean_df.groupby(['activism_in_title', 'activism_in_abstract', 'activism_in_keywords']).size().sort_values(ascending=False)

# Correlation with year
if 'Year' in clean_df.columns:
    mean_prominence_by_year = clean_df.groupby('Year')['activism_prominence'].mean()

# Correlation with field
mean_prominence_by_field = clean_df.groupby('field')['activism_prominence'].mean().sort_values(ascending=False)
In [12]:
# Visualization: Activism Mentions by Field
field_activism_df = field_activism.reset_index()
field_activism_df.columns = ['field', 'count']
fig_field = px.bar(field_activism_df, 
                   x='field', y='count',
                   title='Activism Mentions by Academic Field',
                   labels={'field': 'Academic Field', 'count': 'Number of Articles'})
fig_field.update_xaxes(tickangle=45)
fig_field.update_layout(showlegend=False)
pyo.iplot(fig_field)
In [13]:
# Visualization: Prominence Score Distribution
fig_prom = px.bar(prominence_dist.reset_index(), 
                  x='activism_prominence', y='count',
                  title='Distribution of Activism Prominence Scores',
                  labels={'activism_prominence': 'Prominence Score (0-3)', 'count': 'Number of Articles'},
                  text='count')
fig_prom.update_traces(texttemplate='%{text:,}', textposition='outside')
fig_prom.update_layout(showlegend=False, xaxis_type='category')
pyo.iplot(fig_prom)

print("\nProminence Score Legend:")
print("0 = No mention of 'activism'")
print("1 = Appears in 1 location (title, abstract, or keywords)")
print("2 = Appears in 2 locations")
print("3 = Appears in all 3 locations (title, abstract, and keywords)")
Prominence Score Legend:
0 = No mention of 'activism'
1 = Appears in 1 location (title, abstract, or keywords)
2 = Appears in 2 locations
3 = Appears in all 3 locations (title, abstract, and keywords)
In [14]:
# Visualization: Prominence Combinations
# Create a more readable version of combinations
combinations_df = combinations.reset_index()
combinations_df.columns = ['Title', 'Abstract', 'Keywords', 'Count']

# Use "AND" to make it clear these are combined conditions
combinations_df['Combination'] = combinations_df.apply(
    lambda row: ' AND '.join([label for label, present in 
                              [('Title', row['Title']), 
                               ('Abstract', row['Abstract']), 
                               ('Keywords', row['Keywords'])] 
                              if present]) or 'None', axis=1
)

# Filter out the "None" (False, False, False) for better visualization
combinations_filtered = combinations_df[combinations_df['Combination'] != 'None'].sort_values('Count', ascending=True)

if not combinations_filtered.empty:
    fig_comb = px.bar(combinations_filtered,
                      x='Count', y='Combination',
                      orientation='h',
                      title='Combinations of "Activism" Presence in Metadata Fields',
                      labels={'Count': 'Number of Articles', 'Combination': 'Presence Combination'},
                      text='Count')
    
    fig_comb.update_traces(texttemplate='%{text:,}', textposition='outside')
    fig_comb.update_layout(showlegend=False, height=400)
    pyo.iplot(fig_comb)
else:
    print("No combinations found with activism mentions.")
In [15]:
# Let's examine the actual combinations more clearly
print("Detailed breakdown of activism presence combinations:")
print("="*100)

# Show each unique combination with its exact boolean pattern
for idx, row in combinations_df.iterrows():
    title = row['Title']
    abstract = row['Abstract']
    keywords = row['Keywords']
    combination = row['Combination']
    count = row['Count']
    
    print(f"\nPattern: Title={title}, Abstract={abstract}, Keywords={keywords}")
    print(f"Label: {combination}")
    print(f"Count: {count:,} articles")
    print(f"Explanation: Activism appears {'in title' if title else 'NOT in title'}, "
          f"{'in abstract' if abstract else 'NOT in abstract'}, "
          f"{'in keywords' if keywords else 'NOT in keywords'}")
    
print("\n" + "="*100)
print("Note: Each combination is MUTUALLY EXCLUSIVE - articles are counted in exactly ONE category.")
print("'Title AND Keywords' means activism is in those two places BUT NOT in abstract.")
print("'Title AND Abstract AND Keywords' means activism is in ALL THREE places.")
Detailed breakdown of activism presence combinations:
====================================================================================================

Pattern: Title=False, Abstract=False, Keywords=False
Label: None
Count: 6,725 articles
Explanation: Activism appears NOT in title, NOT in abstract, NOT in keywords

Pattern: Title=False, Abstract=True, Keywords=False
Label: Abstract
Count: 3,991 articles
Explanation: Activism appears NOT in title, in abstract, NOT in keywords

Pattern: Title=False, Abstract=False, Keywords=True
Label: Keywords
Count: 982 articles
Explanation: Activism appears NOT in title, NOT in abstract, in keywords

Pattern: Title=False, Abstract=True, Keywords=True
Label: Abstract AND Keywords
Count: 652 articles
Explanation: Activism appears NOT in title, in abstract, in keywords

Pattern: Title=True, Abstract=False, Keywords=False
Label: Title
Count: 633 articles
Explanation: Activism appears in title, NOT in abstract, NOT in keywords

Pattern: Title=True, Abstract=True, Keywords=False
Label: Title AND Abstract
Count: 547 articles
Explanation: Activism appears in title, in abstract, NOT in keywords

Pattern: Title=True, Abstract=True, Keywords=True
Label: Title AND Abstract AND Keywords
Count: 493 articles
Explanation: Activism appears in title, in abstract, in keywords

Pattern: Title=True, Abstract=False, Keywords=True
Label: Title AND Keywords
Count: 179 articles
Explanation: Activism appears in title, NOT in abstract, in keywords

====================================================================================================
Note: Each combination is MUTUALLY EXCLUSIVE - articles are counted in exactly ONE category.
'Title AND Keywords' means activism is in those two places BUT NOT in abstract.
'Title AND Abstract AND Keywords' means activism is in ALL THREE places.
In [16]:
# Visualization: Prominence Trends Over Time
if 'Year' in clean_df.columns and not mean_prominence_by_year.empty:
    fig_time = px.bar(mean_prominence_by_year.reset_index(), 
                      x='Year', y='activism_prominence',
                      title='Mean Activism Prominence Score Over Time',
                      labels={'Year': 'Publication Year', 'activism_prominence': 'Mean Prominence Score'})
    fig_time.update_layout(showlegend=False)
    fig_time.update_xaxes(tickmode='array', tickvals=mean_prominence_by_year.index.tolist(), ticktext=mean_prominence_by_year.index.tolist())
    pyo.iplot(fig_time)
    
    # Additional line plot showing count of articles by prominence score over time
    activism_by_year = clean_df[clean_df['activism_prominence'] > 0].copy()
    if not activism_by_year.empty:
        # Group by year and prominence score, count articles
        counts_by_year_prominence = activism_by_year.groupby(['Year', 'activism_prominence']).size().reset_index(name='count')
        
        # Create line plot with separate lines for each prominence score
        fig_lines = px.line(counts_by_year_prominence, 
                           x='Year', y='count',
                           color='activism_prominence',
                           title='Count of Articles by Prominence Score Over Time',
                           labels={'Year': 'Publication Year', 'count': 'Number of Articles', 'activism_prominence': 'Prominence Score'},
                           markers=True,
                           category_orders={'activism_prominence': [1, 2, 3]})
        fig_lines.update_xaxes(tickmode='array', tickvals=counts_by_year_prominence['Year'].unique().tolist())
        pyo.iplot(fig_lines)
else:
    print("Year data not available for prominence trend analysis.")
In [17]:
# Visualization: Prominence by Field (grouped bars + mean overlay)
# Prepare data: count of articles by field and prominence score
field_prominence_counts = clean_df.groupby(['field', 'activism_prominence']).size().reset_index(name='count')

# Create grouped bar chart using graph_objects for better control
import plotly.graph_objects as go
from plotly.subplots import make_subplots

# Get unique fields sorted by mean prominence
fields_sorted = mean_prominence_by_field.sort_values(ascending=False).index.tolist()

# Create figure with secondary y-axis
fig_field_prom = make_subplots(specs=[[{"secondary_y": True}]])

# Add bars for each prominence score
colors = ['#d3d3d3', '#90caf9', '#42a5f5', '#1976d2']
for i, prominence in enumerate([0, 1, 2, 3]):
    data = field_prominence_counts[field_prominence_counts['activism_prominence'] == prominence]
    # Reindex to match sorted fields order
    data = data.set_index('field').reindex(fields_sorted, fill_value=0).reset_index()
    
    fig_field_prom.add_trace(
        go.Bar(
            name=f'Prominence {prominence}',
            x=data['field'],
            y=data['count'],
            marker_color=colors[i],
            showlegend=True
        ),
        secondary_y=False
    )

# Add mean prominence as line with markers on secondary y-axis
mean_by_field = mean_prominence_by_field.reindex(fields_sorted)
fig_field_prom.add_trace(
    go.Scatter(
        name='Mean Prominence',
        x=fields_sorted,
        y=mean_by_field.values,
        mode='markers+lines',
        marker=dict(size=12, color='red', symbol='diamond', line=dict(width=2, color='darkred')),
        line=dict(color='red', width=2, dash='dash'),
        yaxis='y2',
        showlegend=True
    ),
    secondary_y=True
)

# Update layout
fig_field_prom.update_xaxes(title_text="Academic Field", tickangle=45)
fig_field_prom.update_yaxes(title_text="Number of Articles", secondary_y=False)
fig_field_prom.update_yaxes(title_text="Mean Prominence Score", secondary_y=True, range=[0, 3])

fig_field_prom.update_layout(
    title='Activism Prominence Distribution by Academic Field (with Mean Overlay)',
    barmode='group',
    height=600,
    legend=dict(orientation='v', yanchor='top', y=1, xanchor='right', x=1.15)
)

pyo.iplot(fig_field_prom)

# Additional: Cross-tabulation table
field_prominence_crosstab = pd.crosstab(clean_df['field'], clean_df['activism_prominence'])
print("\nCross-tabulation of Field vs Prominence Score:")
print(field_prominence_crosstab)
Cross-tabulation of Field vs Prominence Score:
activism_prominence     0     1    2    3
field                                    
epidemiology          560   515  154   74
literature           1621  1498  312  115
medicine             2098  1530  391  142
plant science         138    77   13    5
religious studies    1663  1518  353  111
virology              645   468  155   46

Export Notebook to HTML¶

Run the cell below to export this notebook to HTML with embedded interactive plots and code toggle functionality.

In [18]:
import subprocess
import os

# Export notebook to HTML with embedded plots
notebook_name = 'bibliometric_data_deduplication_analysis.ipynb'
output_html = 'bibliometric_data_deduplication_analysis_with_plots.html'

print("Exporting notebook to HTML with embedded Plotly visualizations...")
result = subprocess.run(
    ['jupyter', 'nbconvert', '--to', 'html', '--template', 'lab', 
     notebook_name, '--output', output_html],
    capture_output=True,
    text=True
)

if result.returncode == 0:
    print(f"✓ Successfully exported to {output_html}")
    
    # Apply code toggle functionality
    if os.path.exists('add_code_toggle.py'):
        print("\nAdding code toggle functionality...")
        result2 = subprocess.run(
            ['python', 'add_code_toggle.py', output_html],
            capture_output=True,
            text=True
        )
        
        if result2.returncode == 0:
            toggle_html = output_html.replace('.html', '_with_toggle.html')
            print(f"✓ Successfully created {toggle_html} with code toggle buttons")
            print(f"\nTwo HTML files created:")
            print(f"  1. {output_html} (with code visible)")
            print(f"  2. {toggle_html} (with code hidden by default, toggle button available)")
        else:
            print(f"⚠ Code toggle script failed: {result2.stderr}")
    else:
        print("\n⚠ add_code_toggle.py script not found - skipping code toggle functionality")
        print(f"HTML exported to: {output_html}")
else:
    print(f"✗ Export failed: {result.stderr}")
Exporting notebook to HTML with embedded Plotly visualizations...
✓ Successfully exported to bibliometric_data_deduplication_analysis_with_plots.html

Adding code toggle functionality...
✓ Successfully created bibliometric_data_deduplication_analysis_with_plots_with_toggle.html with code toggle buttons

Two HTML files created:
  1. bibliometric_data_deduplication_analysis_with_plots.html (with code visible)
  2. bibliometric_data_deduplication_analysis_with_plots_with_toggle.html (with code hidden by default, toggle button available)