Powered By Blogger

Monday, February 24, 2025

Disease-gene association code

CODE:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, make_scorer, f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import warnings

warnings.filterwarnings("ignore")

# Raw data string containing disease information.
# One record per line: DOID, disease name, "<successes> of <total>", two numeric values, p-value.
data = """
DOID:11054 Urinary bladder cancer 5 of 38 2.13 2.53 1.15e-06
DOID:5672 Large intestine cancer 5 of 44 2.07 2.49 1.15e-06
DOID:5093 Thoracic cancer 5 of 71 1.86 2.13 4.77e-06
DOID:9256 Colorectal cancer 4 of 32 2.11 2.1 1.36e-05
DOID:707 B-cell lymphoma 4 of 34 2.09 2.07 1.57e-05
DOID:9561 Nasopharyngeal disease 3 of 9 2.54 1.99 4.49e-05
DOID:0060119 Pharynx cancer 3 of 12 2.41 1.84 8.75e-05
DOID:0060058 Lymphoma 5 of 105 1.69 1.82 1.77e-05
DOID:3910 Lung adenocarcinoma 3 of 13 2.38 1.82 9.71e-05
DOID:4905 Pancreatic carcinoma 3 of 15 2.32 1.77 0.00012
DOID:2531 Hematologic cancer 6 of 190 1.52 1.71 1.11e-05
DOID:684 Hepatocellular carcinoma 3 of 17 2.26 1.7 0.00016
DOID:219 Colon cancer 3 of 17 2.26 1.7 0.00016
DOID:1795 Tumor of exocrine pancreas 3 of 17 2.26 1.7 0.00016
DOID:1612 Breast cancer 4 of 62 1.83 1.65 9.71e-05
DOID:3908 Lung non-small cell carcinoma 3 of 26 2.08 1.51 0.00037
DOID:11166 Obsolete papillomavirus infectious disease 2 of 2 3.02 1.5 0.00062
DOID:0050615 Respiratory system cancer 4 of 81 1.71 1.49 0.00019
DOID:170 Endocrine gland cancer 4 of 93 1.65 1.4 0.00028
DOID:0050686 Organ system cancer 10 of 757 1.14 1.37 1.15e-06
DOID:9261 Nasopharynx carcinoma 2 of 4 2.71 1.33 0.0014
DOID:1240 Leukemia 4 of 104 1.6 1.32 0.00041
DOID:8584 Burkitt lymphoma 2 of 5 2.62 1.27 0.0018
DOID:12704 Ataxia telangiectasia 2 of 5 2.62 1.27 0.0018
DOID:162 Cancer 11 of 978 1.07 1.26 1.15e-06
DOID:1909 Melanoma 3 of 46 1.83 1.22 0.0014
DOID:3498 Pancreatic ductal adenocarcinoma 2 of 6 2.54 1.22 0.0023
DOID:3012 Li-Fraumeni syndrome 2 of 6 2.54 1.22 0.0023
DOID:8557 Oropharynx cancer 2 of 7 2.47 1.18 0.0028
DOID:1037 Lymphoid leukemia 3 of 53 1.77 1.15 0.0019
DOID:2893 Cervix carcinoma 2 of 9 2.36 1.12 0.0036
DOID:3347 Osteosarcoma 2 of 10 2.32 1.09 0.0041
DOID:0060108 Brain glioma 2 of 10 2.32 1.09 0.0041
DOID:4159 Skin cancer 3 of 63 1.69 1.07 0.0028
DOID:5520 Head and neck squamous cell carcinoma 2 of 11 2.28 1.06 0.0047
DOID:8923 Skin melanoma 2 of 12 2.24 1.03 0.0054
DOID:0050621 Respiratory system benign neoplasm 2 of 12 2.24 1.03 0.0054
DOID:345 Uterine disease 3 of 72 1.64 1.01 0.0036
DOID:0050687 Cell type cancer 6 of 451 1.14 1.0 0.00041
DOID:305 Carcinoma 5 of 307 1.23 0.99 0.0010
DOID:786 Laryngeal disease 2 of 14 2.17 0.99 0.0068
DOID:3068 Glioblastoma multiforme 2 of 14 2.17 0.99 0.0068
DOID:768 Retinoblastoma 2 of 16 2.11 0.95 0.0080
DOID:4001 Ovarian carcinoma 2 of 16 2.11 0.95 0.0080
DOID:229 Female reproductive system disease 4 of 192 1.33 0.94 0.0028
DOID:0050745 Diffuse large B-cell lymphoma 2 of 17 2.09 0.93 0.0086
DOID:8618 Oral cavity cancer 2 of 18 2.06 0.92 0.0094
DOID:120 Female reproductive organ cancer 3 of 100 1.49 0.86 0.0077
DOID:77 Gastrointestinal system disease 6 of 576 1.03 0.82 0.0014
DOID:9952 Acute lymphoblastic leukemia 2 of 26 1.9 0.8 0.0165
DOID:2513 Basal cell carcinoma 2 of 27 1.89 0.78 0.0176
DOID:3459 Breast carcinoma 2 of 29 1.85 0.76 0.0195
DOID:403 Mouth disease 3 of 130 1.38 0.74 0.0137
DOID:10534 Stomach cancer 2 of 31 1.83 0.74 0.0216
DOID:289 Endometriosis 2 of 34 1.79 0.71 0.0252
DOID:0070004 Myeloid neoplasm 2 of 38 1.74 0.68 0.0294
DOID:6713 Cerebrovascular disease 2 of 46 1.65 0.61 0.0400
DOID:28 Endocrine system disease 4 of 398 1.02 0.55 0.0243
DOID:225 Syndrome 6 of 1214 0.71 0.41 0.0315
DOID:7 Disease of anatomical entity 12 of 4798 0.41 0.31 0.0185
"""


# Function to parse the raw data into a structured format
def parse_data(data):
    """
    Parse the raw data string into a list of lists containing numerical values and disease names.

    Args:
        data (str): Raw data string with one record per line containing DOID,
            disease name, "<successes> of <total>", and three numeric values.

    Returns:
        list: List of lists, each containing
            [successes, total, value1, value2, p_value, disease].
    """
    lines = data.strip().split('\n')
    data_list = []
    for line in lines:
        parts = line.split()  # Split on any whitespace
        # Disease names may themselves contain 'of' (e.g. "Tumor of exocrine
        # pancreas"), so anchor on the LAST 'of', which belongs to the counts.
        i = len(parts) - parts[::-1].index('of') - 1
        disease = ' '.join(parts[1:i - 1])  # Tokens between the DOID and the success count
        successes = int(parts[i - 1])       # Number before 'of'
        total = int(parts[i + 1])           # Number after 'of'
        value1 = float(parts[i + 2])        # First number after total
        value2 = float(parts[i + 3])        # Second number
        p_value = float(parts[i + 4])       # Third number (p-value)
        data_list.append([successes, total, value1, value2, p_value, disease])
    return data_list


# Parse data into a DataFrame
data_list = parse_data(data)
df = pd.DataFrame(data_list, columns=['Successes', 'Total', 'Value1', 'Value2', 'P_Value', 'Disease'])

# Feature Engineering
df['Accuracy'] = df['Successes'] / df['Total']   # Success rate
df['LogTotal'] = np.log1p(df['Total'])           # Log of total cases
df['LogPValue'] = -np.log10(df['P_Value'])       # Negative log10 of p-value

# Define cancer-related terms for labeling (matched as lowercase substrings of the disease name)
cancer_terms = ["cancer", "carcinoma", "melanoma", "lymphoma", "leukemia", "sarcoma",
                "glioma", "adenocarcinoma", "retinoblastoma", "glioblastoma"]

# Create target variable (1 for cancer, 0 for non-cancer)
y = np.array([1 if any(term in disease.lower() for term in cancer_terms) else 0
              for disease in df['Disease']])

# Define feature set
X = df[['Successes', 'Total', 'Value1', 'Value2', 'P_Value', 'Accuracy', 'LogTotal', 'LogPValue']]

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Custom F1 scorer to handle zero division
def safe_f1(y_true, y_pred):
    """Return the binary F1 score, scoring 0 instead of failing when it is undefined."""
    return f1_score(y_true, y_pred, average='binary', zero_division=0)


custom_f1 = make_scorer(safe_f1)


# --- Model Pipelines and Tuning ---

# SVM Pipeline
def create_svm_pipeline():
    """Build the impute -> scale -> class-weighted SVC pipeline."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('svm', SVC(random_state=42, class_weight='balanced'))
    ])


def tune_svm(X_train, y_train):
    """Grid-search the SVM pipeline with 10-fold CV on F1 and return the best estimator."""
    param_grid = {
        'svm__C': [0.1, 1, 10],
        'svm__kernel': ['rbf', 'linear'],
        'svm__gamma': ['scale', 'auto', 0.1, 1]
    }
    grid_search = GridSearchCV(create_svm_pipeline(), param_grid, scoring=custom_f1,
                               cv=KFold(n_splits=10, shuffle=True, random_state=42), n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_


# Random Forest Pipeline
def create_rf_pipeline():
    """Build the impute -> scale -> class-weighted random forest pipeline."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('rf', RandomForestClassifier(random_state=42, class_weight='balanced'))
    ])


def tune_rf(X_train, y_train):
    """Grid-search the random forest pipeline with 10-fold CV on F1 and return the best estimator."""
    param_grid = {
        'rf__n_estimators': [50, 100, 200],
        'rf__max_depth': [3, 5, 7],
        'rf__min_samples_leaf': [1, 5, 10]
    }
    grid_search = GridSearchCV(create_rf_pipeline(), param_grid, scoring=custom_f1,
                               cv=KFold(n_splits=10, shuffle=True, random_state=42), n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_


# Gradient Boosting Pipeline
def create_gb_pipeline():
    """Build the impute -> scale -> gradient boosting pipeline."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('gb', GradientBoostingClassifier(random_state=42))
    ])


def tune_gb(X_train, y_train):
    """Grid-search the gradient boosting pipeline with 10-fold CV on F1 and return the best estimator."""
    param_grid = {
        'gb__n_estimators': [50, 100, 200],
        'gb__learning_rate': [0.01, 0.1, 0.2],
        'gb__max_depth': [3, 5, 7]
    }
    grid_search = GridSearchCV(create_gb_pipeline(), param_grid, scoring=custom_f1,
                               cv=KFold(n_splits=10, shuffle=True, random_state=42), n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_


# --- Training and Evaluation Function ---
def train_evaluate_model(model_name, create_pipeline_func, tune_func, X_train, y_train, X_test, y_test):
    """
    Train and evaluate a model using the specified pipeline and tuning function.

    Args:
        model_name (str): Name of the model (e.g., "SVM").
        create_pipeline_func (callable): Function to create the model pipeline.
            Not called here: each tune function builds its own pipeline.
        tune_func (callable): Function to tune the model with GridSearchCV.
        X_train, y_train: Training data and labels.
        X_test, y_test: Testing data and labels.

    Returns:
        best_model: The trained and tuned model.
    """
    print(f"\nTraining and Evaluating {model_name}...")
    best_model = tune_func(X_train, y_train)
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = safe_f1(y_test, y_pred)
    print(f"{model_name} Accuracy on Test Set: {accuracy:.4f}")
    print(f"{model_name} F1 Score on Test Set: {f1:.4f}")
    print(f"{model_name} Classification Report:\n{classification_report(y_test, y_pred)}")
    return best_model


# --- Train and Evaluate Models ---
svm_model = train_evaluate_model("SVM", create_svm_pipeline, tune_svm, X_train, y_train, X_test, y_test)
rf_model = train_evaluate_model("Random Forest", create_rf_pipeline, tune_rf, X_train, y_train, X_test, y_test)
gb_model = train_evaluate_model("Gradient Boosting", create_gb_pipeline, tune_gb, X_train, y_train, X_test, y_test)

OUTPUT:

Training and Evaluating SVM... SVM Accuracy on Test Set: 0.7500 SVM F1 Score on Test Set: 0.8421 SVM Classification Report: precision recall f1-score support 0 0.50 0.33 0.40 3 1 0.80 0.89 0.84 9 accuracy 0.75 12 macro avg 0.65 0.61 0.62 12 weighted avg 0.72 0.75 0.73 12 Training and Evaluating Random Forest... Random Forest Accuracy on Test Set: 0.7500 Random Forest F1 Score on Test Set: 0.8421 Random Forest Classification Report: precision recall f1-score support 0 0.50 0.33 0.40 3 1 0.80 0.89 0.84 9 accuracy 0.75 12 macro avg 0.65 0.61 0.62 12 weighted avg 0.72 0.75 0.73 12 Training and Evaluating Gradient Boosting... Gradient Boosting Accuracy on Test Set: 0.7500 Gradient Boosting F1 Score on Test Set: 0.8421 Gradient Boosting Classification Report: precision recall f1-score support 0 0.50 0.33 0.40 3 1 0.80 0.89 0.84 9 accuracy 0.75 12 macro avg 0.65 0.61 0.62 12 weighted avg 0.72 0.75 0.73 12

EXTENDED CODE:

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, KFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, classification_report, make_scorer, f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import warnings
import plotly.express as px  # For dynamic plotting

warnings.filterwarnings("ignore")

# Raw data string (disease dataset).
# One record per line: DOID, disease name, "<successes> of <total>", two numeric values, p-value.
data = """
DOID:11054 Urinary bladder cancer 5 of 38 2.13 2.53 1.15e-06
DOID:5672 Large intestine cancer 5 of 44 2.07 2.49 1.15e-06
DOID:5093 Thoracic cancer 5 of 71 1.86 2.13 4.77e-06
DOID:9256 Colorectal cancer 4 of 32 2.11 2.1 1.36e-05
DOID:707 B-cell lymphoma 4 of 34 2.09 2.07 1.57e-05
DOID:9561 Nasopharyngeal disease 3 of 9 2.54 1.99 4.49e-05
DOID:0060119 Pharynx cancer 3 of 12 2.41 1.84 8.75e-05
DOID:0060058 Lymphoma 5 of 105 1.69 1.82 1.77e-05
DOID:3910 Lung adenocarcinoma 3 of 13 2.38 1.82 9.71e-05
DOID:4905 Pancreatic carcinoma 3 of 15 2.32 1.77 0.00012
DOID:2531 Hematologic cancer 6 of 190 1.52 1.71 1.11e-05
DOID:684 Hepatocellular carcinoma 3 of 17 2.26 1.7 0.00016
DOID:219 Colon cancer 3 of 17 2.26 1.7 0.00016
DOID:1795 Tumor of exocrine pancreas 3 of 17 2.26 1.7 0.00016
DOID:1612 Breast cancer 4 of 62 1.83 1.65 9.71e-05
DOID:3908 Lung non-small cell carcinoma 3 of 26 2.08 1.51 0.00037
DOID:11166 Obsolete papillomavirus infectious disease 2 of 2 3.02 1.5 0.00062
DOID:0050615 Respiratory system cancer 4 of 81 1.71 1.49 0.00019
DOID:170 Endocrine gland cancer 4 of 93 1.65 1.4 0.00028
DOID:0050686 Organ system cancer 10 of 757 1.14 1.37 1.15e-06
DOID:9261 Nasopharynx carcinoma 2 of 4 2.71 1.33 0.0014
DOID:1240 Leukemia 4 of 104 1.6 1.32 0.00041
DOID:8584 Burkitt lymphoma 2 of 5 2.62 1.27 0.0018
DOID:12704 Ataxia telangiectasia 2 of 5 2.62 1.27 0.0018
DOID:162 Cancer 11 of 978 1.07 1.26 1.15e-06
DOID:1909 Melanoma 3 of 46 1.83 1.22 0.0014
DOID:3498 Pancreatic ductal adenocarcinoma 2 of 6 2.54 1.22 0.0023
DOID:3012 Li-Fraumeni syndrome 2 of 6 2.54 1.22 0.0023
DOID:8557 Oropharynx cancer 2 of 7 2.47 1.18 0.0028
DOID:1037 Lymphoid leukemia 3 of 53 1.77 1.15 0.0019
DOID:2893 Cervix carcinoma 2 of 9 2.36 1.12 0.0036
DOID:3347 Osteosarcoma 2 of 10 2.32 1.09 0.0041
DOID:0060108 Brain glioma 2 of 10 2.32 1.09 0.0041
DOID:4159 Skin cancer 3 of 63 1.69 1.07 0.0028
DOID:5520 Head and neck squamous cell carcinoma 2 of 11 2.28 1.06 0.0047
DOID:8923 Skin melanoma 2 of 12 2.24 1.03 0.0054
DOID:0050621 Respiratory system benign neoplasm 2 of 12 2.24 1.03 0.0054
DOID:345 Uterine disease 3 of 72 1.64 1.01 0.0036
DOID:0050687 Cell type cancer 6 of 451 1.14 1.0 0.00041
DOID:305 Carcinoma 5 of 307 1.23 0.99 0.0010
DOID:786 Laryngeal disease 2 of 14 2.17 0.99 0.0068
DOID:3068 Glioblastoma multiforme 2 of 14 2.17 0.99 0.0068
DOID:768 Retinoblastoma 2 of 16 2.11 0.95 0.0080
DOID:4001 Ovarian carcinoma 2 of 16 2.11 0.95 0.0080
DOID:229 Female reproductive system disease 4 of 192 1.33 0.94 0.0028
DOID:0050745 Diffuse large B-cell lymphoma 2 of 17 2.09 0.93 0.0086
DOID:8618 Oral cavity cancer 2 of 18 2.06 0.92 0.0094
DOID:120 Female reproductive organ cancer 3 of 100 1.49 0.86 0.0077
DOID:77 Gastrointestinal system disease 6 of 576 1.03 0.82 0.0014
DOID:9952 Acute lymphoblastic leukemia 2 of 26 1.9 0.8 0.0165
DOID:2513 Basal cell carcinoma 2 of 27 1.89 0.78 0.0176
DOID:3459 Breast carcinoma 2 of 29 1.85 0.76 0.0195
DOID:403 Mouth disease 3 of 130 1.38 0.74 0.0137
DOID:10534 Stomach cancer 2 of 31 1.83 0.74 0.0216
DOID:289 Endometriosis 2 of 34 1.79 0.71 0.0252
DOID:0070004 Myeloid neoplasm 2 of 38 1.74 0.68 0.0294
DOID:6713 Cerebrovascular disease 2 of 46 1.65 0.61 0.0400
DOID:28 Endocrine system disease 4 of 398 1.02 0.55 0.0243
DOID:225 Syndrome 6 of 1214 0.71 0.41 0.0315
DOID:7 Disease of anatomical entity 12 of 4798 0.41 0.31 0.0185
"""


# Parse data into a structured format
def parse_data(data):
    """Parse the raw table into rows of [successes, total, value1, value2, p_value, disease].

    Each input line holds a DOID, a disease name, "<successes> of <total>",
    and three numeric values, separated by whitespace.
    """
    lines = data.strip().split('\n')
    data_list = []
    for line in lines:
        parts = line.split()
        # Disease names may contain 'of' themselves, so anchor on the last one.
        i = len(parts) - parts[::-1].index('of') - 1  # Find last 'of'
        disease = ' '.join(parts[1:i - 1])
        successes = int(parts[i - 1])
        total = int(parts[i + 1])
        value1 = float(parts[i + 2])
        value2 = float(parts[i + 3])
        p_value = float(parts[i + 4])
        data_list.append([successes, total, value1, value2, p_value, disease])
    return data_list


# Create DataFrame
data_list = parse_data(data)
df = pd.DataFrame(data_list, columns=['Successes', 'Total', 'Value1', 'Value2', 'P_Value', 'Disease'])

# Feature Engineering
df['Accuracy'] = df['Successes'] / df['Total']
df['LogTotal'] = np.log1p(df['Total'])
df['LogPValue'] = -np.log10(df['P_Value'])

# Define target variable (cancer vs. non-cancer) by lowercase substring match on the disease name
cancer_terms = ["cancer", "carcinoma", "melanoma", "lymphoma", "leukemia", "sarcoma",
                "glioma", "adenocarcinoma", "retinoblastoma", "glioblastoma"]
y = np.array([1 if any(term in disease.lower() for term in cancer_terms) else 0
              for disease in df['Disease']])

# Features
X = df[['Successes', 'Total', 'Value1', 'Value2', 'P_Value', 'Accuracy', 'LogTotal', 'LogPValue']]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)


# Custom F1 scorer
def safe_f1(y_true, y_pred):
    """Return the binary F1 score, scoring 0 instead of failing when it is undefined."""
    return f1_score(y_true, y_pred, average='binary', zero_division=0)


custom_f1 = make_scorer(safe_f1)


# Model Pipelines
def create_svm_pipeline():
    """Build the impute -> scale -> class-weighted SVC pipeline."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('svm', SVC(random_state=42, class_weight='balanced'))
    ])


def tune_svm(X_train, y_train):
    """Grid-search the SVM pipeline with 10-fold CV on F1 and return the best estimator."""
    param_grid = {'svm__C': [0.1, 1, 10],
                  'svm__kernel': ['rbf', 'linear'],
                  'svm__gamma': ['scale', 'auto', 0.1, 1]}
    grid_search = GridSearchCV(create_svm_pipeline(), param_grid, scoring=custom_f1,
                               cv=KFold(n_splits=10, shuffle=True, random_state=42), n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_


def create_rf_pipeline():
    """Build the impute -> scale -> class-weighted random forest pipeline."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('rf', RandomForestClassifier(random_state=42, class_weight='balanced'))
    ])


def tune_rf(X_train, y_train):
    """Grid-search the random forest pipeline with 10-fold CV on F1 and return the best estimator."""
    param_grid = {'rf__n_estimators': [50, 100, 200],
                  'rf__max_depth': [3, 5, 7],
                  'rf__min_samples_leaf': [1, 5, 10]}
    grid_search = GridSearchCV(create_rf_pipeline(), param_grid, scoring=custom_f1,
                               cv=KFold(n_splits=10, shuffle=True, random_state=42), n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_


def create_gb_pipeline():
    """Build the impute -> scale -> gradient boosting pipeline."""
    return Pipeline([
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('gb', GradientBoostingClassifier(random_state=42))
    ])


def tune_gb(X_train, y_train):
    """Grid-search the gradient boosting pipeline with 10-fold CV on F1 and return the best estimator."""
    param_grid = {'gb__n_estimators': [50, 100, 200],
                  'gb__learning_rate': [0.01, 0.1, 0.2],
                  'gb__max_depth': [3, 5, 7]}
    grid_search = GridSearchCV(create_gb_pipeline(), param_grid, scoring=custom_f1,
                               cv=KFold(n_splits=10, shuffle=True, random_state=42), n_jobs=-1)
    grid_search.fit(X_train, y_train)
    return grid_search.best_estimator_


# Train and Evaluate Models
def train_evaluate_model(model_name, create_func, tune_func, X_train, y_train, X_test, y_test):
    """Tune a model, report its test-set metrics, and return them for plotting.

    Args:
        model_name (str): Label used in the printed report and the metrics row.
        create_func (callable): Pipeline factory. Not called here: each tune
            function builds its own pipeline.
        tune_func (callable): Runs the grid search and returns the best estimator.
        X_train, y_train: Training data and labels.
        X_test, y_test: Testing data and labels.

    Returns:
        tuple: (best_model, {'Model': ..., 'Accuracy': ..., 'F1 Score': ...}).
    """
    print(f"\nTraining and Evaluating {model_name}...")
    best_model = tune_func(X_train, y_train)
    y_pred = best_model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = safe_f1(y_test, y_pred)
    print(f"{model_name} Accuracy: {accuracy:.4f}")
    print(f"{model_name} F1 Score: {f1:.4f}")
    print(f"{model_name} Classification Report:\n{classification_report(y_test, y_pred)}")
    return best_model, {'Model': model_name, 'Accuracy': accuracy, 'F1 Score': f1}


# Train models and collect metrics
models = ['SVM', 'Random Forest', 'Gradient Boosting']
create_funcs = [create_svm_pipeline, create_rf_pipeline, create_gb_pipeline]
tune_funcs = [tune_svm, tune_rf, tune_gb]
metrics_list = []

for model_name, create_func, tune_func in zip(models, create_funcs, tune_funcs):
    _, metrics = train_evaluate_model(model_name, create_func, tune_func, X_train, y_train, X_test, y_test)
    metrics_list.append(metrics)

# Create DataFrame for plotting
metrics_df = pd.DataFrame(metrics_list)

# Generate Dynamic Plot
fig = px.bar(metrics_df, x='Model', y=['Accuracy', 'F1 Score'], barmode='group',
             title='Model Performance Comparison',
             labels={'value': 'Score', 'variable': 'Metric'}, height=500)
fig.update_layout(showlegend=True)
fig.show()

OUTPUT:


Training and Evaluating Random Forest... Random Forest Accuracy: 0.7500 Random Forest F1 Score: 0.8421 Random Forest Classification Report: precision recall f1-score support 0 0.50 0.33 0.40 3 1 0.80 0.89 0.84 9 accuracy 0.75 12 macro avg 0.65 0.61 0.62 12 weighted avg 0.72 0.75 0.73 12 Training and Evaluating Gradient Boosting... Gradient Boosting Accuracy: 0.7500 Gradient Boosting F1 Score: 0.8421 Gradient Boosting Classification Report: precision recall f1-score support 0 0.50 0.33 0.40 3 1 0.80 0.89 0.84 9 accuracy 0.75 12 macro avg 0.65 0.61 0.62 12 weighted avg 0.72 0.75 0.73 12







MOLECULAR AND CELLULAR RELATED CODE

 import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, make_scorer, f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
import warnings
warnings.filterwarnings("ignore")

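# Expanded data (example - replace with your actual data).
# One record per line; parse_data reads tab-separated fields in this order:
# term ID, term name, "<successes> of <total>", value1, value2, p-value,
# protein, drug, integer target label.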
data = """
GO:0097371  MDM2/MDM4 family protein binding  4 of 12 2.54  2.39  5.49e-06  TP53  Methyl Blue 1
GO:0002039  p53 binding 5 of 70 1.87  1.93  1.91e-05  TP53  Methyl Blue 0
GO:0051400  BH domain binding 3 of 10 2.49  1.78  0.00013 TP53  Methyl Blue 1
GO:0031625  Ubiquitin protein ligase binding  7 of 299  1.39  1.51  1.91e-05  TP53  Methyl Blue 0
GO:0097718  Disordered domain specific binding  3 of 34 1.96  1.13  0.0028  TP53  Methyl Blue 1
GO:0047485  Protein N-terminus binding  4 of 109  1.58  1.09  0.0020  TP53  Methyl Blue 0
GO:0019904  Protein domain specific binding 8 of 695  1.08  1.04  0.00011 TP53  Methyl Blue 0
GO:0051434  BH3 domain binding  2 of 6  2.54  0.96  0.0086  TP53  Methyl Blue 1
GO:0004861  Cyclin-dependent protein serine/threonine kinase inhibitor activity 2 of 12 2.24  0.78  0.0206  TP53  Methyl Blue 1
GO:0019899  Enzyme binding  12 of 2084  0.78  0.74  4.91e-05  TP53  Methyl Blue 0
GO:0046982  Protein heterodimerization activity 5 of 368  1.15  0.72  0.0084  TP53  Methyl Blue 0
GO:0008134  Transcription factor binding  6 of 587  1.03  0.7 0.0058  TP53  Methyl Blue 0
GO:0001228  DNA-binding transcription activator activity, RNA polymerase II-specific  5 of 458  1.05  0.59  0.0203  TP53  Methyl Blue 0
GO:0140297  DNA-binding transcription factor binding  5 of 484  1.03  0.58  0.0206  TP53  Methyl Blue 0
GO:0019900  Kinase binding  6 of 785  0.9 0.53  0.0203  TP53  Methyl Blue 0
GO:0042802  Identical protein binding 10 of 2144  0.68  0.52  0.0039  TP53  Methyl Blue 0
GO:0044877  Protein-containing complex binding  7 of 1261 0.76  0.45  0.0266  TP53  Methyl Blue 0
GO:0005515  Protein binding 18 of 7242  0.41  0.4 0.00013 TP53  Methyl Blue 0
GO:0097136  Bcl-2 family protein complex  3 of 10 2.49  1.81  0.00011 TP53  Methyl Blue 1
GO:0005741  Mitochondrial outer membrane  6 of 209  1.47  1.45  7.73e-05  TP53  Methyl Blue 0
GO:0005635  Nuclear envelope  6 of 487  1.11  0.89  0.0012  TP53  Methyl Blue 0
GO:0017053  Transcription repressor complex 3 of 77 1.61  0.83  0.0104  TP53  Methyl Blue 1
GO:0005730  Nucleolus 8 of 996  0.92  0.79  0.00059 TP53  Methyl Blue 0
GO:0031967  Organelle envelope  9 of 1262 0.87  0.76  0.00039 TP53  Methyl Blue 0
GO:0046930  Pore complex  2 of 27 1.89  0.65  0.0360  TP53  Methyl Blue 1
GO:0031965  Nuclear membrane  4 of 305  1.13  0.59  0.0244  TP53  Methyl Blue 0
GO:0005739  Mitochondrion 9 of 1681 0.74  0.58  0.0023  TP53  Methyl Blue 0
GO:0005783  Endoplasmic reticulum 8 of 2021 0.61  0.37  0.0356  TP53  Methyl Blue 0
GO:0005654  Nucleoplasm 12 of 4169  0.47  0.35  0.0124  TP53  Methyl Blue 0
GO:0032991  Protein-containing complex  14 of 5506  0.42  0.33  0.0095  TP53  Methyl Blue 0
GO:0005634  Nucleus 16 of 7672  0.33  0.29  0.0110  TP53  Methyl Blue 0
GO:0005829  Cytosol 13 of 5438  0.39  0.29  0.0287  TP53  Methyl Blue 0
GO:0043227  Membrane-bounded organelle  19 of 13188 0.17  0.2 0.0479  TP53  Methyl Blue 0
BTO:0002254 Vas efferens  3 of 5  2.79  1.85  0.00010 TP53  Methyl Blue 1
BTO:0001967 Cervical cancer cell line 3 of 10 2.49  1.78  0.00013 TP53  Methyl Blue 1
BTO:0000407 Osteosarcoma cell line  3 of 16 2.29  1.62  0.00025 TP53  Methyl Blue 1
BTO:0001913 Colonic adenocarcinoma cell line  3 of 27 2.06  1.39  0.00073 TP53  Methyl Blue 1
BTO:0006256 Solid cancer cell 2 of 2  3.02  1.38  0.0011  TP53  Methyl Blue 1
BTO:0005257 Oropharynx  2 of 2  3.02  1.38  0.0011  TP53  Methyl Blue 1
BTO:0000458 WI-38 cell  2 of 2  3.02  1.38  0.0011  TP53  Methyl Blue 1
BTO:0001938 U2-OS cell  2 of 5  2.62  1.26  0.0019  TP53  Methyl Blue 1
BTO:0001332 DU-145 cell 2 of 5  2.62  1.26  0.0019  TP53  Methyl Blue 1
BTO:0000182 HT-29 cell  2 of 5  2.62  1.26  0.0019  TP53  Methyl Blue 1
BTO:0003080 Pleural fluid 2 of 6  2.54  1.22  0.0023  TP53  Methyl Blue 1
BTO:0000567 HeLa cell 2 of 6  2.54  1.22  0.0023  TP53  Methyl Blue 1
BTO:0000583 Bone marrow cancer cell 7 of 442  1.22  1.19  0.00010 TP53  Methyl Blue 0
BTO:0004725 Embryonic fibroblast  2 of 7  2.47  1.19  0.0026  TP53  Methyl Blue 1
BTO:0001615 Colorectal cancer cell  2 of 7  2.47  1.19  0.0026  TP53  Methyl Blue 1
BTO:0001130 Prostate gland cancer cell  2 of 7  2.47  1.19  0.0026  TP53  Methyl Blue 1
BTO:0000599 Hep-G2 cell 2 of 7  2.47  1.19  0.0026  TP53  Methyl Blue 1
BTO:0000018 A-549 cell  2 of 7  2.47  1.19  0.0026  TP53  Methyl Blue 1
BTO:0000773 Lymphoblastoid cell line  3 of 49 1.8 1.17  0.0018  TP53  Methyl Blue 1
BTO:0000093 MCF-7 cell  2 of 11 2.28  1.08  0.0044  TP53  Methyl Blue 1
BTO:0000669 Embryonic cell line 3 of 73 1.63  1.02  0.0035  TP53  Methyl Blue 1
BTO:0001049 Pharynx 3 of 75 1.62  1.0 0.0037  TP53  Methyl Blue 1
BTO:0004254 Cancer stem cell  2 of 16 2.11  0.96  0.0077  TP53  Methyl Blue 1
BTO:0000353 Lung cell line  3 of 85 1.56  0.94  0.0049  TP53  Methyl Blue 1
BTO:0000580 Blood cancer cell 10 of 1234  0.92  0.88  0.00010 TP53  Methyl Blue 0
BTO:0001271 Leukemia cell 9 of 1067 0.94  0.88  0.00013 TP53  Methyl Blue 0
BTO:0000452 Fibroblast  4 of 224  1.27  0.86  0.0039  TP53  Methyl Blue 0
BTO:0000737 Leukemia cell line  3 of 108  1.46  0.83  0.0085  TP53  Methyl Blue 1
BTO:0000426 Erythroleukemia cell  4 of 244  1.23  0.82  0.0049  TP53  Methyl Blue 0
BTO:0001340 Bronchus  2 of 25 1.92  0.82  0.0148  TP53  Methyl Blue 1
BTO:0002144 Acute lymphoblastic leukemia cell line  2 of 34 1.79  0.71  0.0244  TP53  Methyl Blue 1
BTO:0000740 Myeloid leukemia cell line  2 of 36 1.76  0.7 0.0253  TP53  Methyl Blue 1
BTO:0000180 Cervical carcinoma cell 4 of 322  1.11  0.67  0.0118  TP53  Methyl Blue 0
BTO:0000744 Lymphocytic leukemia cell 4 of 419  1.0 0.54  0.0253  TP53  Methyl Blue 0
BTO:0001541 Pronephros  3 of 219  1.15  0.52  0.0432  TP53  Methyl Blue 0
BTO:0001546 Chronic lymphocytic leukemia cell 3 of 222  1.15  0.52  0.0443  TP53  Methyl Blue 0
BTO:0000284 Organism form 10 of 2542  0.61  0.48  0.0028  TP53  Methyl Blue 0
BTO:0001253 Skin  6 of 1151 0.73  0.44  0.0249  TP53  Methyl Blue 0
BTO:0000379 Embryo  5 of 824  0.8 0.44  0.0339  TP53  Methyl Blue 0
BTO:0000174 Embryonic structure 9 of 2369 0.6 0.43  0.0085  TP53  Methyl Blue 0
BTO:0003099 Internal female genital organ 9 of 2804 0.52  0.35  0.0249  TP53  Methyl Blue 0
BTO:0000083 Female reproductive system  13 of 6111  0.34  0.27  0.0332  TP53  Methyl Blue 0
"""

def parse_data(data):
    """Parse the raw enrichment table into rows of typed values.

    Each non-blank line is one record with the fields: term ID, term name,
    "<successes> of <total>", value1, value2, p-value, protein, drug and an
    integer target label. Tab-separated records are split on tabs. Records
    whose tabs were expanded to spaces (as happens when the table is copied
    from a rendered page) are parsed with a whitespace-tolerant pattern.

    Args:
        data (str): Raw multi-line table text.

    Returns:
        list: One list per record, laid out as
            [successes, total, value1, value2, p_value, protein, drug,
             target, biomaterial], where biomaterial is the term ID.

    Raises:
        ValueError: If a non-blank line matches neither layout or a numeric
            field cannot be converted.
    """
    import re  # local import so the function does not depend on file-level imports

    # Space-delimited fallback. It assumes the protein is a single token and
    # lets the drug name span several words, ending at the final integer label.
    spaced_row = re.compile(
        r"(\S+)\s+(.+?)\s+(\d+)\s+of\s+(\d+)"   # ID, name, successes of total
        r"\s+(\S+)\s+(\S+)\s+(\S+)"             # value1, value2, p-value
        r"\s+(\S+)\s+(.+?)\s+(-?\d+)"           # protein, drug, target
    )

    data_list = []
    for raw_line in data.strip().split('\n'):
        line = raw_line.strip()
        if not line:
            continue  # tolerate blank lines between records

        if '\t' in line:
            parts = [part.strip() for part in line.split('\t')]
            if len(parts) < 9:
                raise ValueError(f"Expected 9 tab-separated fields, got {len(parts)}: {raw_line!r}")
            biomaterial = parts[0]
            successes_text, _, total_text = parts[2].partition(' of ')
            value1_text, value2_text, p_value_text = parts[3], parts[4], parts[5]
            protein, drug, target_text = parts[6], parts[7], parts[8]
        else:
            match = spaced_row.fullmatch(line)
            if match is None:
                raise ValueError(f"Unrecognized record layout: {raw_line!r}")
            (biomaterial, _name, successes_text, total_text,
             value1_text, value2_text, p_value_text,
             protein, drug, target_text) = match.groups()

        data_list.append([
            int(successes_text),
            int(total_text),
            float(value1_text),
            float(value2_text),
            float(p_value_text),
            protein,
            drug,
            int(target_text),  # Directly use provided target
            biomaterial,       # Term ID kept as the biomaterial identifier
        ])
    return data_list

# Build the working table; column order mirrors the row layout produced by parse_data.
data_list = parse_data(data)
df = pd.DataFrame(
    data_list,
    columns=[
        'Successes', 'Total', 'Value1', 'Value2', 'P_Value',
        'Protein', 'Drug', 'Target', 'Biomaterial',
    ],
)

# Data Preprocessing - Encoding Categorical Features
# One encoder instance is refit per column, so afterwards it only remembers
# the classes of the last column encoded ('Biomaterial').
label_encoder = LabelEncoder()
for categorical_column in ('Protein', 'Drug', 'Biomaterial'):
    df[categorical_column] = label_encoder.fit_transform(df[categorical_column])

# Feature Engineering
# Derived columns are added in this fixed order; the 1e-6 terms guard the
# two explicit ratios against division by zero.
df['Accuracy'] = df['Successes'].div(df['Total'])
df['LogTotal'] = np.log1p(df['Total'])
df['LogPValue'] = np.negative(np.log10(df['P_Value']))
df['Value1_x_Value2'] = df['Value1'].mul(df['Value2'])
df['Success_Ratio'] = df['Successes'].div(df['Total'] + 1e-6)  # near-duplicate of 'Accuracy'
df['Value1_Div_Value2'] = df['Value1'].div(df['Value2'] + 1e-6)
df['Combined_Value'] = df['Value1'].add(df['Value2']).add(df['LogTotal'])

# Define features and target
# 'Target' is the label; every other original and engineered column is a feature.
X = df[[
    'Successes', 'Total', 'Value1', 'Value2', 'P_Value',
    'Protein', 'Drug', 'Biomaterial',
    'Accuracy', 'LogTotal', 'LogPValue',
    'Value1_x_Value2', 'Success_Ratio', 'Value1_Div_Value2', 'Combined_Value',
]]
y = df['Target']

# Data Splitting
# 80/20 split, stratified on the label and seeded for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# Custom F1 Scorer
def safe_f1(y_true, y_pred):
    """Return the binary F1 score, yielding 0 rather than failing when it is undefined."""
    return f1_score(y_true, y_pred, average='binary', zero_division=0)


# Scorer wrapper so safe_f1 can be passed as `scoring=` to cross-validation helpers.
custom_f1 = make_scorer(safe_f1)

# Pipelines
def create_pipeline(model, select_features=False, k=10, l2_penalty=0.0):  # Added L2 regularization option
    """Assemble the preprocessing + estimator pipeline for one model.

    The pipeline always imputes missing values with the median, standardizes,
    and applies a normal-output quantile transform. Optional univariate
    feature selection is inserted before the estimator.

    Args:
        model: Estimator to place as the final 'model' step.
        select_features (bool): When True, add a SelectKBest(f_classif) step
            named 'feature_selection'.
        k (int): Number of features SelectKBest keeps.
        l2_penalty (float): L2 strength, used only when `model` is a
            LogisticRegression; converted to C = 1 / (l2_penalty + 1e-9).

    Returns:
        Pipeline: The assembled, unfitted pipeline.
    """
    steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('transformer', QuantileTransformer(output_distribution='normal'))
    ]
    if select_features:
        steps.append(('feature_selection', SelectKBest(score_func=f_classif, k=k)))

    # Add L2 regularization to Logistic Regression (if applicable).
    # Configure a clone rather than assigning attributes on the argument, so
    # the caller's estimator is not silently reconfigured as a side effect.
    if isinstance(model, LogisticRegression):
        from sklearn.base import clone

        model = clone(model).set_params(
            penalty='l2',
            C=1.0 / (l2_penalty + 1e-9),  # Convert L2 strength to the inverse-strength C
        )

    steps.append(('model', model))
    return Pipeline(steps)

# Models
# Deliberately small/regularized configurations, all seeded with random_state=42.
svm_model = SVC(
    kernel='rbf',
    C=0.1,
    gamma='scale',
    random_state=42,
)  # Simplified SVM
rf_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    min_samples_leaf=5,
    random_state=42,
    class_weight='balanced',
)  # Simplified RF
gb_model = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)  # Simplified GB
nn_model = MLPClassifier(
    hidden_layer_sizes=(32,),
    activation='relu',
    solver='adam',
    random_state=42,
    max_iter=200,
    early_stopping=True,
    alpha=0.01,
)  # Simplified NN with L2

# Create Pipelines WITH feature selection.  Tune the 'k' value!
svm_pipeline, rf_pipeline, gb_pipeline, nn_pipeline = (
    create_pipeline(estimator, select_features=True, k=7)
    for estimator in (svm_model, rf_model, gb_model, nn_model)
)

# Training and Evaluation
def train_evaluate_model(pipeline, model_name, X_train, y_train, X_test, y_test, cv=10): # Increased CV folds
    """Cross-validate, fit, and report test-set metrics for one pipeline.

    Args:
        pipeline: Unfitted pipeline to evaluate; it is fitted in place.
        model_name (str): Label used in every printed line.
        X_train, y_train: Training features and labels.
        X_test, y_test: Held-out features and labels.
        cv (int): Number of shuffled K-fold splits for cross-validation.

    Returns:
        The same pipeline object, fitted on the full training set.
    """
    print(f"Training and Evaluating {model_name}...")

    # Cross-validated F1 on the training split, with a fixed shuffle seed.
    folds = KFold(n_splits=cv, shuffle=True, random_state=42)
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=folds, scoring=custom_f1)
    print(f"{model_name} Cross-Validation F1 Scores: {cv_scores}")
    print(f"{model_name} Mean Cross-Validation F1 Score: {cv_scores.mean():.4f}")

    # Final fit on all training rows, then score the held-out split.
    pipeline.fit(X_train, y_train)
    predictions = pipeline.predict(X_test)

    test_accuracy = accuracy_score(y_test, predictions)
    test_f1 = safe_f1(y_test, predictions)
    print(f"{model_name} Accuracy on Test Set: {test_accuracy:.4f}")
    print(f"{model_name} F1 Score on Test Set: {test_f1:.4f}")

    report = classification_report(y_test, predictions)
    print(f"{model_name} Classification Report:\n{report}")

    return pipeline

# Train and Evaluate
# Each call prints its own report and hands back the fitted pipeline.
svm_pipeline = train_evaluate_model(
    pipeline=svm_pipeline, model_name="SVM",
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
)
rf_pipeline = train_evaluate_model(
    pipeline=rf_pipeline, model_name="Random Forest",
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
)
gb_pipeline = train_evaluate_model(
    pipeline=gb_pipeline, model_name="Gradient Boosting",
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
)
nn_pipeline = train_evaluate_model(
    pipeline=nn_pipeline, model_name="Neural Network",
    X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test,
)

# --- Feature Importance (Random Forest) ---
print("\nFeature Importance (Random Forest - Trained on ALL data):") #Use RF for Feature Importance
rf_pipeline.fit(X, y) #Fit on all Data for final training

rf_estimator = rf_pipeline.named_steps['model']
if hasattr(rf_estimator, 'feature_importances_'):
    importance = rf_estimator.feature_importances_

    # The forest only sees the columns kept by the 'feature_selection' step,
    # so importances must be labelled with the selected column names, not
    # with the leading entries of X.columns.
    selector = rf_pipeline.named_steps.get('feature_selection')
    if selector is not None:
        feature_names = X.columns[selector.get_support()]
    else:
        feature_names = X.columns

    for feature_name, score in zip(feature_names, importance):
        print(f"Feature: {feature_name}, Score: {score:.4f}")
else:
    print("Random Forest Model does not have feature_importances_ attribute.")

Protein-Protein interactions code

 from sklearn.model_selection import GridSearchCV

from sklearn.linear_model import LinearRegression

from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error

from sklearn.model_selection import train_test_split

from sklearn.preprocessing import OneHotEncoder

import pandas as pd

import numpy as np

import io

from sklearn.compose import ColumnTransformer

from sklearn.pipeline import Pipeline

from sklearn.svm import SVR

from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor


# Load dataset from the provided string

data = """node1 node2 node1 accession node2 accession node1 annotation node2 annotation score

ATM BRCA1 ENSP00000278616 ENSP00000418960 ataxia telangiectasia mutated; Serine/threonine protein kinase which activates checkpoint signaling upon double strand breaks (DSBs), apoptosis and genotoxic stresses such as ionizing ultraviolet A light (UVA), thereby acting as a DNA damage sensor. Recognizes the substrate consensus sequence [ST]-Q. Phosphorylates ’Ser-139’ of histone variant H2AX/H2AFX at double strand breaks (DSBs), thereby regulating DNA damage response mechanism. Also plays a role in pre-B cell allelic exclusion, a process leading to expression of a single immunoglobulin heavy chain allele to enforce clonality and [...] breast cancer 1, early onset; E3 ubiquitin-protein ligase that specifically mediates the formation of ’Lys-6’-linked polyubiquitin chains and plays a central role in DNA repair by facilitating cellular responses to DNA damage. It is unclear whether it also mediates the formation of other types of polyubiquitin chains. The E3 ubiquitin-protein ligase activity is required for its tumor suppressor function. The BRCA1-BARD1 heterodimer coordinates a diverse range of cellular pathways such as DNA damage repair, ubiquitination and transcriptional regulation to maintain genomic stability. Reg [...] 0.999

ATM CDKN1A ENSP00000278616 ENSP00000244741 ataxia telangiectasia mutated; Serine/threonine protein kinase which activates checkpoint signaling upon double strand breaks (DSBs), apoptosis and genotoxic stresses such as ionizing ultraviolet A light (UVA), thereby acting as a DNA damage sensor. Recognizes the substrate consensus sequence [ST]-Q. Phosphorylates ’Ser-139’ of histone variant H2AX/H2AFX at double strand breaks (DSBs), thereby regulating DNA damage response mechanism. Also plays a role in pre-B cell allelic exclusion, a process leading to expression of a single immunoglobulin heavy chain allele to enforce clonality and [...] cyclin-dependent kinase inhibitor 1A (p21, Cip1); May be the important intermediate by which p53/TP53 mediates its role as an inhibitor of cellular proliferation in response to DNA damage. Binds to and inhibits cyclin-dependent kinase activity, preventing phosphorylation of critical cyclin- dependent kinase substrates and blocking cell cycle progression. Functions in the nuclear localization and assembly of cyclin D- CDK4 complex and promotes its kinase activity towards RB1. At higher stoichiometric ratios, inhibits the kinase activity of the cyclin D-CDK4 complex 0.974

ATM CDKN2A ENSP00000278616 ENSP00000394932 ataxia telangiectasia mutated; Serine/threonine protein kinase which activates checkpoint signaling upon double strand breaks (DSBs), apoptosis and genotoxic stresses such as ionizing ultraviolet A light (UVA), thereby acting as a DNA damage sensor. Recognizes the substrate consensus sequence [ST]-Q. Phosphorylates ’Ser-139’ of histone variant H2AX/H2AFX at double strand breaks (DSBs), thereby regulating DNA damage response mechanism. Also plays a role in pre-B cell allelic exclusion, a process leading to expression of a single immunoglobulin heavy chain allele to enforce clonality and [...] cyclin-dependent kinase inhibitor 2A; Acts as a negative regulator of the proliferation of normal cells by interacting strongly with CDK4 and CDK6. This inhibits their ability to interact with cyclins D and to phosphorylate the retinoblastoma protein 0.710

ATM CREBBP ENSP00000278616 ENSP00000262367 ataxia telangiectasia mutated; Serine/threonine protein kinase which activates checkpoint signaling upon double strand breaks (DSBs), apoptosis and genotoxic stresses such as ionizing ultraviolet A light (UVA), thereby acting as a DNA damage sensor. Recognizes the substrate consensus sequence [ST]-Q. Phosphorylates ’Ser-139’ of histone variant H2AX/H2AFX at double strand breaks (DSBs), thereby regulating DNA damage response mechanism. Also plays a role in pre-B cell allelic exclusion, a process leading to expression of a single immunoglobulin heavy chain allele to enforce clonality and [...] CREB binding protein; Acetylates histones, giving a specific tag for transcriptional activation. Also acetylates non-histone proteins, like NCOA3 and FOXO1. Binds specifically to phosphorylated CREB and enhances its transcriptional activity toward cAMP-responsive genes. Acts as a coactivator of ALX1 in the presence of EP300 0.449

ATM EP300 ENSP00000278616 ENSP00000263253 ataxia telangiectasia mutated; Serine/threonine protein kinase which activates checkpoint signaling upon double strand breaks (DSBs), apoptosis and genotoxic stresses such as ionizing ultraviolet A light (UVA), thereby acting as a DNA damage sensor. Recognizes the substrate consensus sequence [ST]-Q. Phosphorylates ’Ser-139’ of histone variant H2AX/H2AFX at double strand breaks (DSBs), thereby regulating DNA damage response mechanism. Also plays a role in pre-B cell allelic exclusion, a process leading to expression of a single immunoglobulin heavy chain allele to enforce clonality and [...] E1A binding protein p300; Functions as histone acetyltransferase and regulates transcription via chromatin remodeling. Acetylates all four core histones in nucleosomes. Histone acetylation gives an epigenetic tag for transcriptional activation. Mediates cAMP-gene regulation by binding specifically to phosphorylated CREB protein. Also functions as acetyltransferase for nonhistone targets. Acetylates ’Lys-131’ of ALX1 and acts as its coactivator in the presence of CREBBP. Acetylates SIRT2 and is proposed to indirectly increase the transcriptional activity of TP53 through acetylation and [...] 0.624

ATM KAT2B ENSP00000278616 ENSP00000263754 ataxia telangiectasia mutated; Serine/threonine protein kinase which activates checkpoint signaling upon double strand breaks (DSBs), apoptosis and genotoxic stresses such as ionizing ultraviolet A light (UVA), thereby acting as a DNA damage sensor. Recognizes the substrate consensus sequence [ST]-Q. Phosphorylates ’Ser-139’ of histone variant H2AX/H2AFX at double strand breaks (DSBs), thereby regulating DNA damage response mechanism. Also plays a role in pre-B cell allelic exclusion, a process leading to expression of a single immunoglobulin heavy chain allele to enforce clonality and [...] K(lysine) acetyltransferase 2B; Functions as a histone acetyltransferase (HAT) to promote transcriptional activation. Has significant histone acetyltransferase activity with core histones (H3 and H4), and also with nucleosome core particles. Inhibits cell-cycle progression and counteracts the mitogenic activity of the adenoviral oncoprotein E1A. In case of HIV-1 infection, it is recruited by the viral protein Tat. Regulates Tat’s transactivating activity and may help inducing chromatin remodeling of proviral genes 0.582

ATM MAPK8 ENSP00000278616 ENSP00000353483 ataxia telangiectasia mutated; Serine/threonine protein kinase involved in various processes such as cell proliferation, differentiation, migration, transformation and programmed cell death. Extracellular stimuli such as proinflammatory cytokines or physical stress stimulate the stress-activated protein kinase/c-Jun N-terminal kinase (SAP/JNK) signaling pathway. In this cascade, two dual specificity kinases MAP2K4/MKK4 and MAP2K7/MKK7 phosphorylate and activate MAPK8/JNK1. In turn, MAPK8/JNK1 phosphorylates a number of transcription factors, primarily components of AP-1 such as JU [...] mitogen-activated protein kinase 8; Serine/threonine-protein kinase involved in various processes such as cell proliferation, differentiation, migration, transformation and programmed cell death. Extracellular stimuli such as proinflammatory cytokines or physical stress stimulate the stress-activated protein kinase/c-Jun N-terminal kinase (SAP/JNK) signaling pathway. In this cascade, two dual specificity kinases MAP2K4/MKK4 and MAP2K7/MKK7 phosphorylate and activate MAPK8/JNK1. In turn, MAPK8/JNK1 phosphorylates a number of transcription factors, primarily components of AP-1 such as JU [...] 0.449

ATM MDM2 ENSP00000278616 ENSP00000417281 ataxia telangiectasia mutated; Serine/threonine protein kinase which activates checkpoint signaling upon double strand breaks (DSBs), apoptosis and genotoxic stresses such as ionizing ultraviolet A light (UVA), thereby acting as a DNA damage sensor. Recognizes the substrate consensus sequence [ST]-Q. Phosphorylates ’Ser-139’ of histone variant H2AX/H2AFX at double strand breaks (DSBs), thereby regulating DNA damage response mechanism. Also plays a role in pre-B cell allelic exclusion, a process leading to expression of a single immunoglobulin heavy chain allele to enforce clonality and [...] Mdm2, p53 E3 ubiquitin protein ligase homolog (mouse) 0.998

ATM SIRT1 ENSP00000278616 ENSP00000212015 ataxia telangiectasia mutated; Serine/threonine protein kinase which activates checkpoint signaling upon double strand breaks (DSBs), apoptosis and genotoxic stresses such as ionizing ultraviolet A light (UVA), thereby acting as a DNA damage sensor. Recognizes the substrate consensus sequence [ST]-Q. Phosphorylates ’Ser-139’ of histone variant H2AX/H2AFX at double strand breaks (DSBs), thereby regulating DNA damage response mechanism. Also plays a role in pre-B cell allelic exclusion, a process leading to expression of a single immunoglobulin heavy chain allele to enforce clonality and [...] sirtuin 1; NAD-dependent protein deacetylase that links transcriptional regulation directly to intracellular energetics and participates in the coordination of several separated cellular functions such as cell cycle, response to DNA damage, metobolism, apoptosis and autophagy. Can modulate chromatin function through deacetylation of histones and can promote alterations in the methylation of histones and DNA, leading to transcriptional repression. Deacetylates a broad range of transcription factors and coregulators, thereby regulating target gene expression positively and negatively. Se [...] 0.946

ATM TP53 ENSP00000278616 ENSP00000269305 ataxia telangiectasia mutated; Serine/threonine protein kinase which activates checkpoint signaling upon double strand breaks (DSBs), apoptosis and genotoxic stresses such as ionizing ultraviolet A light (UVA), thereby acting as a DNA damage sensor. Recognizes the substrate consensus sequence [ST]-Q. Phosphorylates ’Ser-139’ of histone variant H2AX/H2AFX at double strand breaks (DSBs), thereby regulating DNA damage response mechanism. Also plays a role in pre-B cell allelic exclusion, a process leading to expression of a single immunoglobulin heavy chain allele to enforce clonality and [...] tumor protein p53; Acts as a tumor suppressor in many tumor types; induces growth arrest or apoptosis depending on the physiological circumstances and cell type. Involved in cell cycle regulation as a trans-activator that acts to negatively regulate cell division by controlling a set of genes required for this process. One of the activated genes is an inhibitor of cyclin-dependent kinases. Apoptosis induction seems to be mediated either by stimulation of BAX and FAS antigen expression, or by repression of Bcl-2 expression (By similarity) 0.999

BRCA1 ATM ENSP00000418960 ENSP00000278616 breast cancer 1, early onset; E3 ubiquitin-protein ligase that specifically mediates the formation of ’Lys-6’-linked polyubiquitin chains and plays a central role in DNA repair by facilitating cellular responses to DNA damage. It is unclear whether it also mediates the formation of other types of polyubiquitin chains. The E3 ubiquitin-protein ligase activity is required for its tumor suppressor function. The BRCA1-BARD1 heterodimer coordinates a diverse range of cellular pathways such as DNA damage repair, ubiquitination and transcriptional regulation to maintain genomic stability. Reg [...] ataxia telangiectasia mutated; Serine/threonine protein kinase which activates checkpoint signaling upon double strand breaks (DSBs), apoptosis and genotoxic stresses such as ionizing ultraviolet A light (UVA), thereby acting as a DNA damage sensor. Recognizes the substrate consensus sequence [ST]-Q. Phosphorylates ’Ser-139’ of histone variant H2AX/H2AFX at double strand breaks (DSBs), thereby regulating DNA damage response mechanism. Also plays a role in pre-B cell allelic exclusion, a process leading to expression of a single immunoglobulin heavy chain allele to enforce clonality and [...] 0.999

BRCA1 CDKN1A ENSP00000418960 ENSP00000244741 breast cancer 1, early onset; E3 ubiquitin-protein ligase that specifically mediates the formation of ’Lys-6’-linked polyubiquitin chains and plays a central role in DNA repair by facilitating cellular responses to DNA damage. It is unclear whether it also mediates the formation of other types of polyubiquitin chains. The E3 ubiquitin-protein ligase activity is required for its tumor suppressor function. The BRCA1-BARD1 heterodimer coordinates a diverse range of cellular pathways such as DNA damage repair, ubiquitination and transcriptional regulation to maintain genomic stability. Reg [...] cyclin-dependent kinase inhibitor 1A (p21, Cip1); May be the important intermediate by which p53/TP53 mediates its role as an inhibitor of cellular proliferation in response to DNA damage. Binds to and inhibits cyclin-dependent kinase activity, preventing phosphorylation of critical cyclin- dependent kinase substrates and blocking cell cycle progression. Functions in the nuclear localization and assembly of cyclin D- CDK4 complex and promotes its kinase activity towards RB1. At higher stoichiometric ratios, inhibits the kinase activity of the cyclin D-CDK4 complex 0.834

BRCA1 CDKN2A ENSP00000418960 ENSP00000394932 breast cancer 1, early onset; E3 ubiquitin-protein ligase that specifically mediates the formation of ’Lys-6’-linked polyubiquitin chains and plays a central role in DNA repair by facilitating cellular responses to DNA damage. It is unclear whether it also mediates the formation of other types of polyubiquitin chains. The E3 ubiquitin-protein ligase activity is required for its tumor suppressor function. The BRCA1-BARD1 heterodimer coordinates a diverse range of cellular pathways such as DNA damage repair, ubiquitination and transcriptional regulation to maintain genomic stability. Reg [...] cyclin-dependent kinase inhibitor 2A; Acts as a negative regulator of the proliferation of normal cells by interacting strongly with CDK4 and CDK6. This inhibits their ability to interact with cyclins D and to phosphorylate the retinoblastoma protein 0.876

BRCA1 CREBBP ENSP00000418960 ENSP00000262367 breast cancer 1, early onset; E3 ubiquitin-protein ligase that specifically mediates the formation of ’Lys-6’-linked polyubiquitin chains and plays a central role in DNA repair by facilitating cellular responses to DNA damage. It is unclear whether it also mediates the formation of other types of polyubiquitin chains. The E3 ubiquitin-protein ligase activity is required for its tumor suppressor function. The BRCA1-BARD1 heterodimer coordinates a diverse range of cellular pathways such as DNA damage repair, ubiquitination and transcriptional regulation to maintain genomic stability. Reg [...] CREB binding protein; Acetylates histones, giving a specific tag for transcriptional activation. Also acetylates histones, like NCOA3 and FOXO1. Binds specifically to phosphorylated CREB and enhances its transcriptional activity toward cAMP-responsive genes. Acts as a coactivator of ALX1 in the presence of EP300 0.997

BRCA1 EP300 ENSP00000418960 ENSP00000263253 breast cancer 1, early onset; E3 ubiquitin-protein ligase that specifically mediates the formation of ’Lys-6’-linked polyubiquitin chains and plays a central role in DNA repair by facilitating cellular responses to DNA damage. It is unclear whether it also mediates the formation of other types of polyubiquitin chains. The E3 ubiquitin-protein ligase activity is required for its tumor suppressor function. The BRCA1-BARD1 heterodimer coordinates a diverse range of cellular pathways such as DNA damage repair, ubiquitination and transcriptional regulation to maintain genomic stability. Reg [...] E1A binding protein p300; Functions as histone acetyltransferase and regulates transcription via chromatin remodeling. Acetylates all four core histones in nucleosomes. Histone acetylation gives an epigenetic tag for transcriptional activation. Mediates cAMP-gene regulation by binding specifically to phosphorylated CREB protein. Also functions as acetyltransferase for nonhistone targets. Acetylates ’Lys-131’ of ALX1 and acts as its coactivator in the presence of CREBBP. Acetylates SIRT2 and is proposed to indirectly increase the transcriptional activity of TP53 through acetylation and [...] 0.992

BRCA1 KAT2B ENSP00000418960 ENSP00000263754 breast cancer 1, early onset; E3 ubiquitin-protein ligase that specifically mediates the formation of ’Lys-6’-linked polyubiquitin chains and plays a central role in DNA repair by facilitating cellular responses to DNA damage. It is unclear whether it also mediates the formation of other types of polyubiquitin chains. The E3 ubiquitin-protein ligase activity is required for its tumor suppressor function. The BRCA1-BARD1 heterodimer coordinates a diverse range of cellular pathways such as DNA damage repair, ubiquitination and transcriptional regulation to maintain genomic stability. Reg [...] K(lysine) acetyltransferase 2B; Functions as a histone acetyltransferase (HAT) to promote transcriptional activation. Has significant histone acetyltransferase activity with core histones (H3 and H4), and also with nucleosome core particles. Inhibits cell-cycle progression and counteracts the mitogenic activity of the adenoviral oncoprotein E1A. In case of HIV-1 infection, it is recruited by the viral protein Tat. Regulates Tat’s transactivating activity and may help inducing chromatin remodeling of proviral genes 0.420

BRCA1 MDM2 ENSP00000418960 ENSP00000417281 breast cancer 1, early onset; E3 ubiquitin-protein ligase that specifically mediates the formation of ’Lys-6’-linked polyubiquitin chains and plays a central role in DNA repair by facilitating cellular responses to DNA damage. It is unclear whether it also mediates the formation of other types of polyubiquitin chains. The E3 ubiquitin-protein ligase activity is required for its tumor suppressor function. The BRCA1-BARD1 heterodimer coordinates a diverse range of cellular pathways such as DNA damage repair, ubiquitination and transcriptional regulation to maintain genomic stability. Reg [...] Mdm2, p53 E3 ubiquitin protein ligase homolog (mouse) 0.652

BRCA1 SIRT1 ENSP00000418960 ENSP00000212015 breast cancer 1, early onset; E3 ubiquitin-protein ligase that specifically mediates the formation of ’Lys-6’-linked polyubiquitin chains and plays a central role in DNA repair by facilitating cellular responses to DNA damage. It is unclear whether it also mediates the formation of other types of polyubiquitin chains. The E3 ubiquitin-protein ligase activity is required for its tumor suppressor function. The BRCA1-BARD1 heterodimer coordinates a diverse range of cellular pathways such as DNA damage repair, ubiquitination and transcriptional regulation to maintain genomic stability. Reg [...] sirtuin 1; NAD-dependent protein deacetylase that links transcriptional regulation directly to intracellular energetics and participates in the coordination of several separated cellular functions such as cell cycle, response to DNA damage, metobolism, apoptosis and autophagy. Can modulate chromatin function through deacetylation of histones and can promote alterations in the methylation of histones and DNA, leading to transcriptional repression. Deacetylates a broad range of transcription factors and coregulators, thereby regulating target gene expression positively and negatively. Se [...] 0.793

BRCA1 TP53 ENSP00000418960 ENSP00000269305 breast cancer 1, early onset; E3 ubiquitin-protein ligase that specifically mediates the formation of ’Lys-6’-linked polyubiquitin chains and plays a central role in DNA repair by facilitating cellular responses to DNA damage. It is unclear whether it also mediates the formation of other types of polyubiquitin chains. The E3 ubiquitin-protein ligase activity is required for its tumor suppressor function. The BRCA1-BARD1 heterodimer coordinates a diverse range of cellular pathways such as DNA damage repair, ubiquitination and transcriptional regulation to maintain genomic stability. Reg [...] tumor protein p53; Acts as a tumor suppressor in many tumor types; induces growth arrest or apoptosis depending on the physiological circumstances and cell type. Involved in cell cycle regulation as a trans-activator that acts to negatively regulate cell division by controlling a set of genes required for this process. One of the activated genes is an inhibitor of cyclin-dependent kinases. Apoptosis induction seems to be mediated either by stimulation of BAX and FAS antigen expression, or by repression of Bcl-2 expression (By similarity) 0.999"""


# Load dataset
# The embedded table is parsed as tab-separated text with a header row.
try:
    df = pd.read_csv(io.StringIO(data), sep='\t')
except Exception as e:
    # Script-level boundary: report the problem and fall through to the
    # "empty DataFrame" branch below instead of aborting with a traceback.
    print(f"Error loading data: {e}")
    df = pd.DataFrame()

# Columns the rest of the script indexes by name. If the separator did not
# match (e.g. tabs were converted to spaces), the frame is non-empty but these
# columns are absent, which would otherwise surface as a bare KeyError.
required_cols = ['node1', 'node2', 'score']
missing_cols = [col for col in required_cols if col not in df.columns]

if df.empty:
    print("DataFrame is empty. Check data loading process.")
elif missing_cols:
    print(f"Missing expected columns {missing_cols}. Check that the data is tab-separated.")
else:
    # Convert 'score' column to float; unparseable values become NaN and the
    # corresponding rows are dropped.
    df['score'] = pd.to_numeric(df['score'], errors='coerce')
    df = df.dropna(subset=['score'])

    # Both features are protein identifiers, so both are one-hot encoded.
    categorical_cols = ['node1', 'node2']

    # Create a preprocessor
    preprocessor = ColumnTransformer(
        transformers=[
            ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_cols)],
        remainder='passthrough')

    # Split data: only 'node1' and 'node2' are used as features.
    X = df[categorical_cols]
    y = df['score']
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Models
    models = {
        "SVR": SVR(),
        "RandomForestRegressor": RandomForestRegressor(random_state=42),
        "GradientBoostingRegressor": GradientBoostingRegressor(random_state=42),
    }

    # Hyper-parameter grids keyed by model name; the 'model__' prefix targets
    # the 'model' step of the pipeline. A model without an entry is fitted
    # with its defaults (no grid search).
    param_grids = {
        "SVR": {
            'model__C': [0.1, 1, 10],
            'model__kernel': ['rbf', 'linear'],
        },
        "RandomForestRegressor": {
            'model__n_estimators': [100, 200, 300],
            'model__max_depth': [3, 5, 7],
        },
        "GradientBoostingRegressor": {
            'model__n_estimators': [100, 200, 300],
            'model__learning_rate': [0.01, 0.1, 0.2],
            'model__max_depth': [3, 4, 5],
        },
    }

    for model_name, model in models.items():
        print(f"Evaluating {model_name}:")

        pipeline = Pipeline([
            ('preprocessor', preprocessor),
            ('model', model)
        ])

        param_grid = param_grids.get(model_name, {})

        # Apply GridSearchCV if parameters available
        if param_grid:
            grid_search = GridSearchCV(pipeline, param_grid, scoring='neg_mean_squared_error', cv=5)
            grid_search.fit(X_train, y_train)
            best_model = grid_search.best_estimator_
            y_pred = best_model.predict(X_test)

            # Print best parameters
            print(f"  Best parameters: {grid_search.best_params_}")
        else:
            # Train the model with default settings and predict directly.
            pipeline.fit(X_train, y_train)
            y_pred = pipeline.predict(X_test)

        # Calculate metrics on the held-out split. MSE is computed once and
        # RMSE derived from it.
        mse = mean_squared_error(y_test, y_pred)
        rmse = np.sqrt(mse)
        mae = mean_absolute_error(y_test, y_pred)
        r2 = r2_score(y_test, y_pred)

        print(f"  RMSE: {rmse}")
        print(f"  MSE: {mse}")
        print(f"  MAE: {mae}")
        print(f"  R^2: {r2}")
        print("-" * 30)


OUTPUT:


Evaluating SVR:
  Best parameters: {'model__C': 0.1, 'model__kernel': 'rbf'}
  RMSE: 0.1683784486997364
  MSE: 0.028351301986529756
  MAE: 0.1524623464882887
  R^2: -0.03446656902734979
------------------------------
Evaluating RandomForestRegressor:
  Best parameters: {'model__max_depth': 3, 'model__n_estimators': 300}
  RMSE: 0.12814572274025987
  MSE: 0.016421326256623556
  MAE: 0.10554134060846573
  R^2: 0.40082776305514656
------------------------------
Evaluating GradientBoostingRegressor:
  Best parameters: {'model__learning_rate': 0.01, 'model__max_depth': 5, 'model__n_estimators': 100}
  RMSE: 0.1435502767434981
  MSE: 0.020606681953134892
  MAE: 0.11404125169586254
  R^2: 0.24811482769908288


CODE:

import networkx as nx

import matplotlib.pyplot as plt


# Sample PPI data (replace with actual STRING data)
# Maps each source node to the list of partners it is drawn as connected to.
ppi_data = {
    'TP53': ['MDM2', 'CDKN1A'],  # Example interactions
    'Curcumin': ['NFKB1', 'COX2'],  # Example interactions
    'Methylene Blue': ['CYP1A2', 'NOS3'],  # Example interactions
}

# Create a graph
G = nx.Graph()

# Add nodes and edges
for protein, interactions in ppi_data.items():
    # Add the source node explicitly so it still appears in the plot when its
    # interaction list is empty.
    G.add_node(protein)
    for interaction in interactions:
        G.add_edge(protein, interaction)

# Visualize the graph
plt.figure(figsize=(10, 8))
nx.draw(G, with_labels=True, node_color='skyblue', edge_color='gray')
plt.title("Protein-Protein Interactions related to Curcumin, Methylene Blue and TP53")
plt.show()





Protein-Protein interactions code © 2025 by Devise Foundation is licensed under CC BY-NC-ND 4.0 

Reviving Life and Redefining Medicine: The ConsciousLeaf Vision

  Date : April 08, 2025 Author : Mrinmoy Chakraborty, Chairman of Devise Foundation, in collaboration with Grok 3 (xAI) Introduction At Devi...