Devise Foundation: Global Cancer Data Visualizer

Key Updates:

Sample Size:
- Increased to 1 million (self.sample_size = 1_000_000).
- Visualizations handle this scale, with the scatter plot subsampling to 10,000 points for clarity.
Real Data Integration:
- Cancer Types: Uses a hard-coded, GLOBOCAN 2022-inspired distribution (Lung: 30%, Breast: 25%, Prostate: 20%, Colorectal: 25%); no data is fetched at runtime.
- Age: Normal distribution centered at 60 (reflecting higher cancer incidence in older populations).
- Gender: Adjusted for cancer type (e.g., Prostate all Male, Breast 99% Female).
- Limitations: Weight, ethnicity, lifestyle, stage, treatment response, and treatment type are simulated due to the lack of public patient-level data. Distributions are based on general knowledge (e.g., 40% early-stage, 35% chemotherapy).
- Plots:
  - All 7 plots are included:
    1. Age Distribution (histogram)
    2. Gender Distribution (bar)
    3. Weight Distribution (histogram)
    4. Ethnicity Distribution (bar)
    5. Lifestyle Distribution (bar)
    6. Age vs Weight by Gender (scatter, subsampled)
    7. Treatment Overview (stacked bar)

CODE:

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import requests

class CancerDataVisualizer:
    """Build a synthetic cancer-patient table and plot summaries of it.

    The table is generated once, at construction time, and stored in
    ``self.data`` as a pandas DataFrame with the columns ``age``, ``gender``,
    ``weight``, ``ethnicity``, ``lifestyle``, ``cancer_stage``,
    ``treatment_response``, ``treatment_type`` and ``cancer_type``.
    Every column is randomly sampled; nothing is downloaded.

    Args:
        sample_size: Number of synthetic patients to generate. Must be a
            positive integer. Defaults to 1,000,000.
        seed: Integer seed for the random generator and for the scatter-plot
            subsample, or ``None`` for non-reproducible output. Defaults
            to 42.

    Raises:
        ValueError: If ``sample_size`` is not positive.
    """

    def __init__(self, sample_size=1_000_000, seed=42):
        if sample_size <= 0:
            raise ValueError("sample_size must be a positive integer")
        self.sample_size = sample_size
        self.seed = seed
        self.data = self._generate_hybrid_data()
        self.figsize = (12, 8)

    def _fetch_globocan_data(self):
        """Return the cancer-type labels and their sampling probabilities.

        Despite the name, nothing is fetched: the values are hard-coded
        approximations that the original author describes as inspired by
        GLOBOCAN 2022.

        Returns:
            A ``(cancer_types, incidence_probs)`` tuple of two parallel lists.
        """
        # Note: GLOBOCAN API requires specific access; using static data from gco.iarc.fr for demo
        # Real data: https://gco.iarc.fr/today/data/factsheets/cancers/39-All-cancers-fact-sheet.pdf
        cancer_types = ['Lung', 'Breast', 'Prostate', 'Colorectal']
        incidence_probs = [0.30, 0.25, 0.20, 0.25]  # Approximated from GLOBOCAN 2022
        return cancer_types, incidence_probs

    def _generate_hybrid_data(self):
        """Generate the synthetic patient DataFrame.

        A local ``numpy.random.Generator`` seeded with ``self.seed`` is used
        so that the global NumPy random state is left untouched.

        Returns:
            A DataFrame with ``self.sample_size`` rows.
        """
        rng = np.random.default_rng(self.seed)
        n = self.sample_size

        # Cancer type is drawn first because gender depends on it.
        cancer_types, cancer_probs = self._fetch_globocan_data()
        cancer_type = rng.choice(cancer_types, n, p=cancer_probs)

        # Age: normal around 60, clipped to the 20-90 range.
        age = rng.normal(60, 15, n).clip(20, 90)

        # Gender: roughly equal baseline, then overridden per cancer type
        # (Prostate is always Male, Breast is 99% Female).
        gender = rng.choice(['Male', 'Female'], n, p=[0.51, 0.49])
        gender = np.where(cancer_type == 'Prostate', 'Male', gender)
        breast_gender = rng.choice(['Male', 'Female'], n, p=[0.01, 0.99])
        gender = np.where(cancer_type == 'Breast', breast_gender, gender)

        # Weight must be derived from the FINAL gender; computing it before
        # the cancer-type override would pair reassigned patients with the
        # other gender's weight distribution.
        weight = np.where(
            gender == 'Male',
            rng.normal(80, 15, n).clip(50, 130),
            rng.normal(70, 15, n).clip(40, 120),
        )

        # Remaining categorical columns are independent draws.
        ethnicity = rng.choice(
            ['Asian', 'Caucasian', 'African', 'Hispanic'],
            n,
            p=[0.40, 0.35, 0.15, 0.10],
        )
        lifestyle = rng.choice(
            ['Smoker', 'Non-smoker', 'Active', 'Sedentary'],
            n,
            p=[0.20, 0.30, 0.25, 0.25],
        )
        cancer_stage = rng.choice(
            ['Early', 'Mid', 'Late'],
            n,
            p=[0.40, 0.35, 0.25],
        )
        treatment_response = rng.choice(
            ['Positive', 'Neutral', 'Negative'],
            n,
            p=[0.50, 0.30, 0.20],
        )
        treatment_type = rng.choice(
            ['Chemotherapy', 'Radiation', 'Surgery', 'Immunotherapy'],
            n,
            p=[0.35, 0.25, 0.25, 0.15],
        )

        return pd.DataFrame({
            'age': age,
            'gender': gender,
            'weight': weight,
            'ethnicity': ethnicity,
            'lifestyle': lifestyle,
            'cancer_stage': cancer_stage,
            'treatment_response': treatment_response,
            'treatment_type': treatment_type,
            'cancer_type': cancer_type
        })

    @staticmethod
    def _format_count(count):
        """Return a short label for a row count, e.g. 1_000_000 -> '1M'.

        Exact multiples of one million or one thousand get an ``M`` / ``K``
        suffix; anything else is rendered with thousands separators.
        """
        for divisor, suffix in ((1_000_000, 'M'), (1_000, 'K')):
            if count >= divisor and count % divisor == 0:
                return f"{count // divisor}{suffix}"
        return f"{count:,}"

    def _plot_histogram(self, column, color, subject, xlabel):
        """Draw a 50-bin histogram with a KDE overlay for a numeric column."""
        plt.figure(figsize=self.figsize)
        sns.histplot(data=self.data, x=column, bins=50, kde=True, color=color)
        size_label = self._format_count(len(self.data))
        plt.title(f'{subject} Distribution of Cancer Patients ({size_label} Sample)', fontsize=16)
        plt.xlabel(xlabel, fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.grid(True, alpha=0.3)
        plt.show()

    def _plot_counts(self, column, palette, subject, rotate_ticks=False):
        """Draw a bar chart of category counts for a categorical column."""
        plt.figure(figsize=self.figsize)
        sns.countplot(data=self.data, x=column, palette=palette)
        size_label = self._format_count(len(self.data))
        plt.title(f'{subject} Distribution of Cancer Patients ({size_label} Sample)', fontsize=16)
        plt.xlabel(subject, fontsize=12)
        plt.ylabel('Count', fontsize=12)
        if rotate_ticks:
            plt.xticks(rotation=45)
        plt.show()

    def plot_age_distribution(self):
        """Show a histogram of patient ages."""
        self._plot_histogram('age', 'blue', 'Age', 'Age (years)')

    def plot_gender_distribution(self):
        """Show a bar chart of patient counts per gender."""
        self._plot_counts('gender', 'Set2', 'Gender')

    def plot_weight_distribution(self):
        """Show a histogram of patient weights."""
        self._plot_histogram('weight', 'green', 'Weight', 'Weight (kg)')

    def plot_ethnicity_distribution(self):
        """Show a bar chart of patient counts per ethnicity."""
        self._plot_counts('ethnicity', 'Set3', 'Ethnicity', rotate_ticks=True)

    def plot_lifestyle_distribution(self):
        """Show a bar chart of patient counts per lifestyle category."""
        self._plot_counts('lifestyle', 'Set1', 'Lifestyle', rotate_ticks=True)

    def plot_age_weight_relationship(self):
        """Show an age-vs-weight scatter plot, colored by gender.

        At most 10,000 rows are plotted. The subsample is seeded with
        ``self.seed`` so the figure is reproducible, and it is capped at the
        table size so small datasets do not raise.
        """
        plt.figure(figsize=self.figsize)
        # Subsample for scatter plot clarity
        subsample_size = min(10_000, len(self.data))
        sample = self.data.sample(subsample_size, random_state=self.seed)
        sns.scatterplot(data=sample, x='age', y='weight', hue='gender', size='weight', alpha=0.6)
        subsample_label = self._format_count(subsample_size)
        total_label = self._format_count(len(self.data))
        plt.title(f'Age vs Weight by Gender ({subsample_label} Subsample of {total_label})', fontsize=16)
        plt.xlabel('Age (years)', fontsize=12)
        plt.ylabel('Weight (kg)', fontsize=12)
        plt.grid(True, alpha=0.3)
        plt.show()

    def plot_treatment_overview(self):
        """Show a stacked bar chart of treatment type and response.

        Bars are grouped by (cancer type, cancer stage); each stack segment is
        one (treatment type, treatment response) combination.
        """
        ct = pd.crosstab(
            index=[self.data['cancer_type'], self.data['cancer_stage']],
            columns=[self.data['treatment_type'], self.data['treatment_response']]
        )
        # Draw onto an explicit Axes: DataFrame.plot opens its own figure when
        # no ``ax`` is given, which would leave a separately created figure
        # empty.
        _, ax = plt.subplots(figsize=(15, 10))
        ct.plot(kind='bar', stacked=True, ax=ax, colormap='tab20')
        size_label = self._format_count(len(self.data))
        plt.title(f'Cancer Treatment Overview by Type and Stage ({size_label} Sample)', fontsize=16)
        plt.xlabel('Cancer Type and Stage', fontsize=12)
        plt.ylabel('Number of Patients', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.legend(title='Treatment & Response', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.show()

def main():
    """Build the visualizer and display each of its seven plots in order."""
    viz = CancerDataVisualizer()
    row_count = len(viz.data)
    print(f"Generating visualizations for cancer patient data (Sample Size: {row_count:,})...")

    # Plots are shown in this fixed order, one after another.
    plot_steps = (
        viz.plot_age_distribution,
        viz.plot_gender_distribution,
        viz.plot_weight_distribution,
        viz.plot_ethnicity_distribution,
        viz.plot_lifestyle_distribution,
        viz.plot_age_weight_relationship,
        viz.plot_treatment_overview,
    )
    for draw in plot_steps:
        draw()

    finished_at = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    print(f"Visualizations completed on {finished_at}")

# Run the full plotting sequence only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()

PLOTS:

Devise Foundation

Monday, March 17, 2025

Global Cancer Data Visualizer

No comments:

Post a Comment

Reviving Life and Redefining Medicine: The ConsciousLeaf Vision

Report Abuse

Labels

Popular Posts