Key Updates:
- Sample Size:
- Increased to 1 million (self.sample_size = 1_000_000).
- Visualizations handle this scale, with the scatter plot subsampling to 10,000 points for clarity.
- Real Data Integration:
- Cancer Types: Uses GLOBOCAN 2022-inspired distribution (Lung: 30%, Breast: 25%, Prostate: 20%, Colorectal: 25%).
- Age: Normal distribution centered at 60 with a standard deviation of 15, clipped to the 20–90 range (reflecting higher cancer incidence in older populations).
- Gender: Adjusted for cancer type (e.g., Prostate all Male, Breast 99% Female).
- Limitations: Weight, lifestyle, stage, treatment response, and treatment type are simulated due to lack of public patient-level data. Their distributions are based on general knowledge (e.g., 40% of patients are early-stage and 35% receive chemotherapy).
- Plots:
- All 7 plots are included:
- Age Distribution (histogram)
- Gender Distribution (bar)
- Weight Distribution (histogram)
- Ethnicity Distribution (bar)
- Lifestyle Distribution (bar)
- Age vs Weight by Gender (scatter, subsampled)
- Treatment Overview (stacked bar)
CODE:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime
import requests
class CancerDataVisualizer:
    """Generate a synthetic cancer-patient dataset and plot summaries of it.

    The dataset mixes approximated cancer-type proportions with fully
    simulated attributes (weight, ethnicity, lifestyle, stage, treatment).
    Each ``plot_*`` method opens one matplotlib figure and shows it.

    Attributes:
        sample_size: Number of synthetic patients to generate.
        seed: Seed for the local random generator and the scatter subsample.
        data: ``pandas.DataFrame`` with one row per synthetic patient.
        figsize: Default ``(width, height)`` used by the single-panel plots.
    """

    def __init__(self, sample_size: int = 1_000_000, seed: int = 42):
        """Build the dataset eagerly.

        Args:
            sample_size: Number of rows to generate (default: 1 million).
            seed: Seed that makes data generation reproducible.

        Raises:
            ValueError: If ``sample_size`` is smaller than 1.
        """
        if sample_size < 1:
            raise ValueError("sample_size must be a positive integer")
        self.sample_size = sample_size  # 1 million by default
        self.seed = seed
        self.data = self._generate_hybrid_data()
        self.figsize = (12, 8)

    @staticmethod
    def _format_count(count: int) -> str:
        """Return a compact label for a row count (e.g. ``1M``, ``10K``)."""
        if count >= 1_000_000 and count % 1_000_000 == 0:
            return f"{count // 1_000_000}M"
        if count >= 1_000 and count % 1_000 == 0:
            return f"{count // 1_000}K"
        return f"{count:,}"

    def _fetch_globocan_data(self):
        """Return the cancer types and their sampling probabilities.

        Despite the name, nothing is fetched: the values are static
        approximations, so no network access is required.

        Returns:
            A ``(cancer_types, incidence_probs)`` pair of equal-length lists;
            the probabilities sum to 1.
        """
        # Note: GLOBOCAN API requires specific access; using static data from gco.iarc.fr for demo
        # Real data: https://gco.iarc.fr/today/data/factsheets/cancers/39-All-cancers-fact-sheet.pdf
        cancer_types = ['Lung', 'Breast', 'Prostate', 'Colorectal']
        incidence_probs = [0.30, 0.25, 0.20, 0.25]  # Approximated from GLOBOCAN 2022
        return cancer_types, incidence_probs

    def _generate_hybrid_data(self):
        """Generate the synthetic patient table.

        Returns:
            A DataFrame with ``self.sample_size`` rows and the columns
            ``age``, ``gender``, ``weight``, ``ethnicity``, ``lifestyle``,
            ``cancer_stage``, ``treatment_response``, ``treatment_type`` and
            ``cancer_type``.
        """
        # A local generator keeps the run reproducible without reseeding
        # NumPy's process-wide random state.
        rng = np.random.default_rng(self.seed)
        n = self.sample_size

        cancer_types, cancer_probs = self._fetch_globocan_data()

        # Age: normal around 60, clipped to a plausible adult range.
        age = rng.normal(60, 15, n).clip(20, 90)

        # Cancer type is drawn first because gender depends on it.
        cancer_type = rng.choice(cancer_types, n, p=cancer_probs)

        # Gender: roughly even baseline, then Prostate -> Male and
        # Breast -> 99% Female.
        gender = rng.choice(['Male', 'Female'], n, p=[0.51, 0.49])
        gender = np.where(cancer_type == 'Prostate', 'Male', gender)
        breast_gender = rng.choice(['Male', 'Female'], n, p=[0.01, 0.99])
        gender = np.where(cancer_type == 'Breast', breast_gender, gender)

        # Weight must be derived from the FINAL gender; computing it before
        # the cancer-type adjustment would leave reassigned patients with the
        # other gender's weight distribution and clip bounds.
        male_weight = rng.normal(80, 15, n).clip(50, 130)
        female_weight = rng.normal(70, 15, n).clip(40, 120)
        weight = np.where(gender == 'Male', male_weight, female_weight)

        # Ethnicity (global distribution, approximated)
        ethnicity = rng.choice(
            ['Asian', 'Caucasian', 'African', 'Hispanic'],
            n,
            p=[0.40, 0.35, 0.15, 0.10],
        )
        # Lifestyle (simulated proportions)
        lifestyle = rng.choice(
            ['Smoker', 'Non-smoker', 'Active', 'Sedentary'],
            n,
            p=[0.20, 0.30, 0.25, 0.25],
        )
        # Cancer stage (simulated proportions)
        cancer_stage = rng.choice(
            ['Early', 'Mid', 'Late'],
            n,
            p=[0.40, 0.35, 0.25],
        )
        # Treatment response (simulated proportions)
        treatment_response = rng.choice(
            ['Positive', 'Neutral', 'Negative'],
            n,
            p=[0.50, 0.30, 0.20],
        )
        # Treatment type (simulated proportions)
        treatment_type = rng.choice(
            ['Chemotherapy', 'Radiation', 'Surgery', 'Immunotherapy'],
            n,
            p=[0.35, 0.25, 0.25, 0.15],
        )

        return pd.DataFrame({
            'age': age,
            'gender': gender,
            'weight': weight,
            'ethnicity': ethnicity,
            'lifestyle': lifestyle,
            'cancer_stage': cancer_stage,
            'treatment_response': treatment_response,
            'treatment_type': treatment_type,
            'cancer_type': cancer_type,
        })

    def _sample_label(self) -> str:
        """Return the compact row-count label used in plot titles."""
        return self._format_count(len(self.data))

    def _plot_histogram(self, column, color, title, xlabel):
        """Show a 50-bin histogram with a KDE overlay for a numeric column."""
        plt.figure(figsize=self.figsize)
        sns.histplot(data=self.data, x=column, bins=50, kde=True, color=color)
        plt.title(title, fontsize=16)
        plt.xlabel(xlabel, fontsize=12)
        plt.ylabel('Count', fontsize=12)
        plt.grid(True, alpha=0.3)
        plt.show()

    def _plot_counts(self, column, palette, title, xlabel, rotate_labels=False):
        """Show a bar chart of category counts for a categorical column."""
        plt.figure(figsize=self.figsize)
        sns.countplot(data=self.data, x=column, palette=palette)
        plt.title(title, fontsize=16)
        plt.xlabel(xlabel, fontsize=12)
        plt.ylabel('Count', fontsize=12)
        if rotate_labels:
            plt.xticks(rotation=45)
        plt.show()

    def plot_age_distribution(self):
        """Show the histogram of patient ages."""
        self._plot_histogram(
            'age', 'blue',
            f'Age Distribution of Cancer Patients ({self._sample_label()} Sample)',
            'Age (years)',
        )

    def plot_gender_distribution(self):
        """Show the number of patients per gender."""
        self._plot_counts(
            'gender', 'Set2',
            f'Gender Distribution of Cancer Patients ({self._sample_label()} Sample)',
            'Gender',
        )

    def plot_weight_distribution(self):
        """Show the histogram of patient weights."""
        self._plot_histogram(
            'weight', 'green',
            f'Weight Distribution of Cancer Patients ({self._sample_label()} Sample)',
            'Weight (kg)',
        )

    def plot_ethnicity_distribution(self):
        """Show the number of patients per ethnicity."""
        self._plot_counts(
            'ethnicity', 'Set3',
            f'Ethnicity Distribution of Cancer Patients ({self._sample_label()} Sample)',
            'Ethnicity',
            rotate_labels=True,
        )

    def plot_lifestyle_distribution(self):
        """Show the number of patients per lifestyle category."""
        self._plot_counts(
            'lifestyle', 'Set1',
            f'Lifestyle Distribution of Cancer Patients ({self._sample_label()} Sample)',
            'Lifestyle',
            rotate_labels=True,
        )

    def plot_age_weight_relationship(self):
        """Show an age-vs-weight scatter plot of a subsample, coloured by gender."""
        plt.figure(figsize=self.figsize)
        # Subsample for scatter plot clarity. The size is capped at the number
        # of available rows (sampling more rows than exist raises), and the
        # draw is seeded so the figure is reproducible.
        subsample_size = min(10_000, len(self.data))
        sample = self.data.sample(n=subsample_size, random_state=self.seed)
        sns.scatterplot(data=sample, x='age', y='weight', hue='gender', size='weight', alpha=0.6)
        plt.title(
            f'Age vs Weight by Gender ({self._format_count(subsample_size)} '
            f'Subsample of {self._sample_label()})',
            fontsize=16,
        )
        plt.xlabel('Age (years)', fontsize=12)
        plt.ylabel('Weight (kg)', fontsize=12)
        plt.grid(True, alpha=0.3)
        plt.show()

    def plot_treatment_overview(self):
        """Show stacked bars of treatment type/response per cancer type and stage."""
        ct = pd.crosstab(
            index=[self.data['cancer_type'], self.data['cancer_stage']],
            columns=[self.data['treatment_type'], self.data['treatment_response']]
        )
        # Draw on an explicitly created axes. Calling plt.figure() and then
        # DataFrame.plot(figsize=...) opens two figures, one of them blank.
        _, ax = plt.subplots(figsize=(15, 10))
        ct.plot(kind='bar', stacked=True, ax=ax, colormap='tab20')
        plt.title(
            f'Cancer Treatment Overview by Type and Stage ({self._sample_label()} Sample)',
            fontsize=16,
        )
        plt.xlabel('Cancer Type and Stage', fontsize=12)
        plt.ylabel('Number of Patients', fontsize=12)
        plt.xticks(rotation=45, ha='right')
        plt.legend(title='Treatment & Response', bbox_to_anchor=(1.05, 1), loc='upper left')
        plt.tight_layout()
        plt.show()
def main():
    """Create the visualizer, render all seven plots in order, and report completion."""
    visualizer = CancerDataVisualizer()
    print(f"Generating visualizations for cancer patient data (Sample Size: {len(visualizer.data):,})...")
    # Plot routines listed in display order; each one opens and shows a figure.
    plot_steps = (
        visualizer.plot_age_distribution,
        visualizer.plot_gender_distribution,
        visualizer.plot_weight_distribution,
        visualizer.plot_ethnicity_distribution,
        visualizer.plot_lifestyle_distribution,
        visualizer.plot_age_weight_relationship,
        visualizer.plot_treatment_overview,
    )
    for draw in plot_steps:
        draw()
    print(f"Visualizations completed on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")


# Run the demo only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
PLOTS:
No comments:
Post a Comment