import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import StandardScaler, QuantileTransformer, LabelEncoder
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score, classification_report, make_scorer, f1_score
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import SelectKBest, f_classif
import numpy as np
import warnings
# NOTE(review): this silences every warning process-wide, including
# scikit-learn convergence and undefined-metric warnings.
warnings.filterwarnings("ignore")
# Expanded Data (Example - replace with your actual data)
#
# One record per line. parse_data() below reads the fields of each row as:
#   term ID, description, "<successes> of <total>", Value1, Value2, P_Value,
#   protein, drug, integer target label.
# parse_data() splits each row on TAB characters, so the fields must be
# tab-separated for the split to yield the nine expected columns.
# NOTE(review): Value1/Value2 look like enrichment statistics exported from an
# annotation tool -- their exact meaning is not stated here; confirm.
data = """
GO:0097371 MDM2/MDM4 family protein binding 4 of 12 2.54 2.39 5.49e-06 TP53 Methyl Blue 1
GO:0002039 p53 binding 5 of 70 1.87 1.93 1.91e-05 TP53 Methyl Blue 0
GO:0051400 BH domain binding 3 of 10 2.49 1.78 0.00013 TP53 Methyl Blue 1
GO:0031625 Ubiquitin protein ligase binding 7 of 299 1.39 1.51 1.91e-05 TP53 Methyl Blue 0
GO:0097718 Disordered domain specific binding 3 of 34 1.96 1.13 0.0028 TP53 Methyl Blue 1
GO:0047485 Protein N-terminus binding 4 of 109 1.58 1.09 0.0020 TP53 Methyl Blue 0
GO:0019904 Protein domain specific binding 8 of 695 1.08 1.04 0.00011 TP53 Methyl Blue 0
GO:0051434 BH3 domain binding 2 of 6 2.54 0.96 0.0086 TP53 Methyl Blue 1
GO:0004861 Cyclin-dependent protein serine/threonine kinase inhibitor activity 2 of 12 2.24 0.78 0.0206 TP53 Methyl Blue 1
GO:0019899 Enzyme binding 12 of 2084 0.78 0.74 4.91e-05 TP53 Methyl Blue 0
GO:0046982 Protein heterodimerization activity 5 of 368 1.15 0.72 0.0084 TP53 Methyl Blue 0
GO:0008134 Transcription factor binding 6 of 587 1.03 0.7 0.0058 TP53 Methyl Blue 0
GO:0001228 DNA-binding transcription activator activity, RNA polymerase II-specific 5 of 458 1.05 0.59 0.0203 TP53 Methyl Blue 0
GO:0140297 DNA-binding transcription factor binding 5 of 484 1.03 0.58 0.0206 TP53 Methyl Blue 0
GO:0019900 Kinase binding 6 of 785 0.9 0.53 0.0203 TP53 Methyl Blue 0
GO:0042802 Identical protein binding 10 of 2144 0.68 0.52 0.0039 TP53 Methyl Blue 0
GO:0044877 Protein-containing complex binding 7 of 1261 0.76 0.45 0.0266 TP53 Methyl Blue 0
GO:0005515 Protein binding 18 of 7242 0.41 0.4 0.00013 TP53 Methyl Blue 0
GO:0097136 Bcl-2 family protein complex 3 of 10 2.49 1.81 0.00011 TP53 Methyl Blue 1
GO:0005741 Mitochondrial outer membrane 6 of 209 1.47 1.45 7.73e-05 TP53 Methyl Blue 0
GO:0005635 Nuclear envelope 6 of 487 1.11 0.89 0.0012 TP53 Methyl Blue 0
GO:0017053 Transcription repressor complex 3 of 77 1.61 0.83 0.0104 TP53 Methyl Blue 1
GO:0005730 Nucleolus 8 of 996 0.92 0.79 0.00059 TP53 Methyl Blue 0
GO:0031967 Organelle envelope 9 of 1262 0.87 0.76 0.00039 TP53 Methyl Blue 0
GO:0046930 Pore complex 2 of 27 1.89 0.65 0.0360 TP53 Methyl Blue 1
GO:0031965 Nuclear membrane 4 of 305 1.13 0.59 0.0244 TP53 Methyl Blue 0
GO:0005739 Mitochondrion 9 of 1681 0.74 0.58 0.0023 TP53 Methyl Blue 0
GO:0005783 Endoplasmic reticulum 8 of 2021 0.61 0.37 0.0356 TP53 Methyl Blue 0
GO:0005654 Nucleoplasm 12 of 4169 0.47 0.35 0.0124 TP53 Methyl Blue 0
GO:0032991 Protein-containing complex 14 of 5506 0.42 0.33 0.0095 TP53 Methyl Blue 0
GO:0005634 Nucleus 16 of 7672 0.33 0.29 0.0110 TP53 Methyl Blue 0
GO:0005829 Cytosol 13 of 5438 0.39 0.29 0.0287 TP53 Methyl Blue 0
GO:0043227 Membrane-bounded organelle 19 of 13188 0.17 0.2 0.0479 TP53 Methyl Blue 0
BTO:0002254 Vas efferens 3 of 5 2.79 1.85 0.00010 TP53 Methyl Blue 1
BTO:0001967 Cervical cancer cell line 3 of 10 2.49 1.78 0.00013 TP53 Methyl Blue 1
BTO:0000407 Osteosarcoma cell line 3 of 16 2.29 1.62 0.00025 TP53 Methyl Blue 1
BTO:0001913 Colonic adenocarcinoma cell line 3 of 27 2.06 1.39 0.00073 TP53 Methyl Blue 1
BTO:0006256 Solid cancer cell 2 of 2 3.02 1.38 0.0011 TP53 Methyl Blue 1
BTO:0005257 Oropharynx 2 of 2 3.02 1.38 0.0011 TP53 Methyl Blue 1
BTO:0000458 WI-38 cell 2 of 2 3.02 1.38 0.0011 TP53 Methyl Blue 1
BTO:0001938 U2-OS cell 2 of 5 2.62 1.26 0.0019 TP53 Methyl Blue 1
BTO:0001332 DU-145 cell 2 of 5 2.62 1.26 0.0019 TP53 Methyl Blue 1
BTO:0000182 HT-29 cell 2 of 5 2.62 1.26 0.0019 TP53 Methyl Blue 1
BTO:0003080 Pleural fluid 2 of 6 2.54 1.22 0.0023 TP53 Methyl Blue 1
BTO:0000567 HeLa cell 2 of 6 2.54 1.22 0.0023 TP53 Methyl Blue 1
BTO:0000583 Bone marrow cancer cell 7 of 442 1.22 1.19 0.00010 TP53 Methyl Blue 0
BTO:0004725 Embryonic fibroblast 2 of 7 2.47 1.19 0.0026 TP53 Methyl Blue 1
BTO:0001615 Colorectal cancer cell 2 of 7 2.47 1.19 0.0026 TP53 Methyl Blue 1
BTO:0001130 Prostate gland cancer cell 2 of 7 2.47 1.19 0.0026 TP53 Methyl Blue 1
BTO:0000599 Hep-G2 cell 2 of 7 2.47 1.19 0.0026 TP53 Methyl Blue 1
BTO:0000018 A-549 cell 2 of 7 2.47 1.19 0.0026 TP53 Methyl Blue 1
BTO:0000773 Lymphoblastoid cell line 3 of 49 1.8 1.17 0.0018 TP53 Methyl Blue 1
BTO:0000093 MCF-7 cell 2 of 11 2.28 1.08 0.0044 TP53 Methyl Blue 1
BTO:0000669 Embryonic cell line 3 of 73 1.63 1.02 0.0035 TP53 Methyl Blue 1
BTO:0001049 Pharynx 3 of 75 1.62 1.0 0.0037 TP53 Methyl Blue 1
BTO:0004254 Cancer stem cell 2 of 16 2.11 0.96 0.0077 TP53 Methyl Blue 1
BTO:0000353 Lung cell line 3 of 85 1.56 0.94 0.0049 TP53 Methyl Blue 1
BTO:0000580 Blood cancer cell 10 of 1234 0.92 0.88 0.00010 TP53 Methyl Blue 0
BTO:0001271 Leukemia cell 9 of 1067 0.94 0.88 0.00013 TP53 Methyl Blue 0
BTO:0000452 Fibroblast 4 of 224 1.27 0.86 0.0039 TP53 Methyl Blue 0
BTO:0000737 Leukemia cell line 3 of 108 1.46 0.83 0.0085 TP53 Methyl Blue 1
BTO:0000426 Erythroleukemia cell 4 of 244 1.23 0.82 0.0049 TP53 Methyl Blue 0
BTO:0001340 Bronchus 2 of 25 1.92 0.82 0.0148 TP53 Methyl Blue 1
BTO:0002144 Acute lymphoblastic leukemia cell line 2 of 34 1.79 0.71 0.0244 TP53 Methyl Blue 1
BTO:0000740 Myeloid leukemia cell line 2 of 36 1.76 0.7 0.0253 TP53 Methyl Blue 1
BTO:0000180 Cervical carcinoma cell 4 of 322 1.11 0.67 0.0118 TP53 Methyl Blue 0
BTO:0000744 Lymphocytic leukemia cell 4 of 419 1.0 0.54 0.0253 TP53 Methyl Blue 0
BTO:0001541 Pronephros 3 of 219 1.15 0.52 0.0432 TP53 Methyl Blue 0
BTO:0001546 Chronic lymphocytic leukemia cell 3 of 222 1.15 0.52 0.0443 TP53 Methyl Blue 0
BTO:0000284 Organism form 10 of 2542 0.61 0.48 0.0028 TP53 Methyl Blue 0
BTO:0001253 Skin 6 of 1151 0.73 0.44 0.0249 TP53 Methyl Blue 0
BTO:0000379 Embryo 5 of 824 0.8 0.44 0.0339 TP53 Methyl Blue 0
BTO:0000174 Embryonic structure 9 of 2369 0.6 0.43 0.0085 TP53 Methyl Blue 0
BTO:0003099 Internal female genital organ 9 of 2804 0.52 0.35 0.0249 TP53 Methyl Blue 0
BTO:0000083 Female reproductive system 13 of 6111 0.34 0.27 0.0332 TP53 Methyl Blue 0
"""
def parse_data(data):
    """Parse the raw enrichment table text into a list of row lists.

    Each non-blank line must carry nine fields: term ID, description,
    ``"<successes> of <total>"``, two float values, a p-value, protein,
    drug and an integer target label.  Tab-separated rows are split on the
    tabs; rows whose tabs were lost (e.g. a copy/paste that collapsed them to
    spaces) are recovered with a regular expression.  In that fallback the
    protein is taken to be a single whitespace-free token, while the
    description and the drug may contain spaces.

    Args:
        data: Multi-line string, one record per line.

    Returns:
        A list of ``[successes, total, value1, value2, p_value, protein,
        drug, target, biomaterial]`` lists, where ``biomaterial`` is the
        term ID found in the first field.  Empty input yields ``[]``.

    Raises:
        ValueError: If a non-blank line matches neither layout, or a numeric
            field cannot be converted.
    """
    import re

    # Whitespace-delimited fallback: ID, description (lazy), "k of n",
    # three numeric tokens, protein, drug (lazy), trailing integer label.
    row_pattern = re.compile(
        r'^(\S+)\s+(.*?)\s+(\d+)\s+of\s+(\d+)'
        r'\s+(\S+)\s+(\S+)\s+(\S+)'
        r'\s+(\S+)\s+(.+?)\s+(\d+)$'
    )

    data_list = []
    for line in data.strip().split('\n'):
        if not line.strip():
            # Tolerate blank separator lines and completely empty input.
            continue

        parts = line.split('\t')
        if len(parts) >= 9:
            biomaterial = parts[0]  # Capture biomaterial ID
            counts = parts[2].split(' of ')
            successes, total = counts[0], counts[1]
            value1, value2, p_value, protein, drug, target = parts[3:9]
        else:
            match = row_pattern.match(line.strip())
            if match is None:
                raise ValueError(f"Cannot parse data row: {line!r}")
            (biomaterial, _description, successes, total,
             value1, value2, p_value, protein, drug, target) = match.groups()

        data_list.append([
            int(successes),
            int(total),
            float(value1),
            float(value2),
            float(p_value),
            protein,
            drug,
            int(target),  # Directly use provided target
            biomaterial,
        ])
    return data_list
# Parse the raw text and load the rows into a DataFrame.
data_list = parse_data(data)
df = pd.DataFrame(
    data_list,
    columns=[
        'Successes', 'Total', 'Value1', 'Value2', 'P_Value',
        'Protein', 'Drug', 'Target', 'Biomaterial',
    ],
)

# Data Preprocessing - integer-encode the categorical columns.
# The same encoder instance is refitted for each column in turn, so after
# this section it only holds the mapping of the last column (Biomaterial).
label_encoder = LabelEncoder()
df['Protein'] = label_encoder.fit_transform(df['Protein'])
df['Drug'] = label_encoder.fit_transform(df['Drug'])
df['Biomaterial'] = label_encoder.fit_transform(df['Biomaterial'])

# Feature Engineering - derived columns are appended in dependency order
# (Combined_Value needs LogTotal, so it comes last).
df = df.assign(
    Accuracy=lambda frame: frame['Successes'] / frame['Total'],
    LogTotal=lambda frame: np.log1p(frame['Total']),
    LogPValue=lambda frame: -np.log10(frame['P_Value']),
    Value1_x_Value2=lambda frame: frame['Value1'] * frame['Value2'],
    Success_Ratio=lambda frame: frame['Successes'] / (frame['Total'] + 1e-6),
    Value1_Div_Value2=lambda frame: frame['Value1'] / (frame['Value2'] + 1e-6),
    Combined_Value=lambda frame: frame['Value1'] + frame['Value2'] + frame['LogTotal'],
)

# Define features and target
X = df[[
    'Successes', 'Total', 'Value1', 'Value2', 'P_Value',
    'Protein', 'Drug', 'Biomaterial',
    'Accuracy', 'LogTotal', 'LogPValue',
    'Value1_x_Value2', 'Success_Ratio', 'Value1_Div_Value2',
    'Combined_Value',
]]
y = df['Target']

# Data Splitting - 80/20 hold-out, stratified on the target label.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)
# Custom F1 Scorer
def safe_f1(y_true, y_pred):
    """Return the binary F1 score, yielding 0 where the score is undefined.

    Passing ``zero_division=0`` makes ``f1_score`` return 0 instead of
    emitting a warning when precision or recall has a zero denominator.
    """
    return f1_score(y_true, y_pred, zero_division=0, average='binary')


# Scorer object wrapping safe_f1 for use with cross_val_score(scoring=...).
custom_f1 = make_scorer(safe_f1)
# Pipelines
def create_pipeline(model, select_features=False, k=10, l2_penalty=0.0):
    """Wrap ``model`` in a preprocessing pipeline.

    The pipeline always runs median imputation, standard scaling and a
    quantile transform to a normal output distribution, in that order.
    NOTE(review): the quantile transform is rank-based, so the preceding
    StandardScaler is probably redundant -- left in place to keep results
    unchanged; confirm before removing.

    Args:
        model: Estimator placed as the final ``'model'`` step.
        select_features: When true, insert a ``SelectKBest(f_classif)`` step
            named ``'feature_selection'`` before the model.
        k: Number of features kept by the selection step.
        l2_penalty: Optional L2 strength for ``LogisticRegression`` models.
            A positive value is converted to ``C = 1 / l2_penalty``.  The
            default ``0.0`` leaves the estimator's own ``penalty``/``C``
            untouched (previously it silently replaced ``C`` with ~1e9,
            i.e. switched regularisation off).

    Returns:
        An unfitted ``Pipeline``.

    Raises:
        ValueError: If ``l2_penalty`` is negative.
    """
    from sklearn.base import clone

    if l2_penalty < 0:
        raise ValueError(f"l2_penalty must be non-negative, got {l2_penalty!r}")

    steps = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('transformer', QuantileTransformer(output_distribution='normal')),
    ]
    if select_features:
        steps.append(('feature_selection', SelectKBest(score_func=f_classif, k=k)))

    # Add L2 regularization to Logistic Regression (if applicable).
    if isinstance(model, LogisticRegression) and l2_penalty > 0:
        # Work on a clone so the caller's estimator is not mutated, and go
        # through set_params() rather than poking attributes directly.
        model = clone(model).set_params(
            penalty='l2',
            C=1.0 / (l2_penalty + 1e-9),  # Convert L2 strength to C
        )

    steps.append(('model', model))
    return Pipeline(steps)
# Models - deliberately small/regularised configurations.
svm_model = SVC(
    kernel='rbf',
    C=0.1,
    gamma='scale',
    random_state=42,
)
rf_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    min_samples_leaf=5,
    random_state=42,
    class_weight='balanced',
)
gb_model = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
nn_model = MLPClassifier(
    hidden_layer_sizes=(32,),
    activation='relu',
    solver='adam',
    random_state=42,
    max_iter=200,
    early_stopping=True,
    alpha=0.01,  # L2 penalty term
)

# Create Pipelines WITH feature selection. Tune the 'k' value!
# Every model gets the same preprocessing and keeps the 7 best features.
svm_pipeline, rf_pipeline, gb_pipeline, nn_pipeline = (
    create_pipeline(estimator, select_features=True, k=7)
    for estimator in (svm_model, rf_model, gb_model, nn_model)
)
# Training and Evaluation
def train_evaluate_model(pipeline, model_name, X_train, y_train, X_test, y_test, cv=10):
    """Cross-validate, fit and evaluate ``pipeline``, printing the results.

    Args:
        pipeline: Unfitted estimator/pipeline; it is fitted in place.
        model_name: Label used in the printed output.
        X_train, y_train: Training features and labels.
        X_test, y_test: Hold-out features and labels.
        cv: Number of folds, or a ready-made scikit-learn CV splitter.
            An integer builds a shuffled ``StratifiedKFold`` (seed 42) so
            every fold keeps the class ratio, in line with the stratified
            train/test split used elsewhere in this script.

    Returns:
        The same ``pipeline`` object, fitted on the full training set.
    """
    import numbers

    from sklearn.model_selection import StratifiedKFold

    print(f"Training and Evaluating {model_name}...")

    # Cross-validation
    if isinstance(cv, numbers.Integral):
        splitter = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    else:
        splitter = cv
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=splitter, scoring=custom_f1)
    print(f"{model_name} Cross-Validation F1 Scores: {cv_scores}")
    print(f"{model_name} Mean Cross-Validation F1 Score: {cv_scores.mean():.4f}")

    # Train on full training data
    pipeline.fit(X_train, y_train)

    # Predictions and Evaluation
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = safe_f1(y_test, y_pred)
    # zero_division=0 mirrors safe_f1: undefined metrics are reported as 0.
    report = classification_report(y_test, y_pred, zero_division=0)
    print(f"{model_name} Accuracy on Test Set: {accuracy:.4f}")
    print(f"{model_name} F1 Score on Test Set: {f1:.4f}")
    print(f"{model_name} Classification Report:\n{report}")
    return pipeline
# Train and Evaluate
svm_pipeline = train_evaluate_model(svm_pipeline, "SVM", X_train, y_train, X_test, y_test)
rf_pipeline = train_evaluate_model(rf_pipeline, "Random Forest", X_train, y_train, X_test, y_test)
gb_pipeline = train_evaluate_model(gb_pipeline, "Gradient Boosting", X_train, y_train, X_test, y_test)
nn_pipeline = train_evaluate_model(nn_pipeline, "Neural Network", X_train, y_train, X_test, y_test)

# --- Feature Importance (Random Forest) ---
print("\nFeature Importance (Random Forest - Trained on ALL data):")
rf_pipeline.fit(X, y)  # Refit on all data for the final importance report
rf_estimator = rf_pipeline.named_steps['model']
if hasattr(rf_estimator, 'feature_importances_'):
    importance = rf_estimator.feature_importances_
    feature_names = X.columns
    # The forest only sees the columns kept by the selection step, so the
    # importances must be labelled with the selected names, not with the
    # first len(importance) columns of X.
    selector = rf_pipeline.named_steps.get('feature_selection')
    if selector is not None:
        feature_names = feature_names[selector.get_support()]
    for name, score in zip(feature_names, importance):
        print(f"Feature: {name}, Score: {score:.4f}")
else:
    print("Random Forest Model does not have feature_importances_ attribute.")
# (Blog page footer captured with the script, not code: "No comments: / Post a Comment")