Devise Foundation: MOLECULAR AND CELLULAR RELATED CODE

import re
import warnings

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, make_scorer, f1_score
from sklearn.model_selection import train_test_split, cross_val_score, KFold, StratifiedKFold
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, QuantileTransformer, LabelEncoder
from sklearn.svm import SVC
warnings.filterwarnings("ignore")

# Expanded Data (Example - replace with your actual data)
# One record per line. Reading left to right, parse_data() below consumes:
#   term ID (GO:/BTO: prefixed), free-text term description, "<k> of <n>" counts,
#   two numeric values, a p-value, a protein name, a drug name and a 0/1 target label.
# The description column is present in every record but is not used by parse_data().
data = """
GO:0097371  MDM2/MDM4 family protein binding  4 of 12 2.54  2.39  5.49e-06  TP53  Methyl Blue 1
GO:0002039  p53 binding 5 of 70 1.87  1.93  1.91e-05  TP53  Methyl Blue 0
GO:0051400  BH domain binding 3 of 10 2.49  1.78  0.00013 TP53  Methyl Blue 1
GO:0031625  Ubiquitin protein ligase binding  7 of 299  1.39  1.51  1.91e-05  TP53  Methyl Blue 0
GO:0097718  Disordered domain specific binding  3 of 34 1.96  1.13  0.0028  TP53  Methyl Blue 1
GO:0047485  Protein N-terminus binding  4 of 109  1.58  1.09  0.0020  TP53  Methyl Blue 0
GO:0019904  Protein domain specific binding 8 of 695  1.08  1.04  0.00011 TP53  Methyl Blue 0
GO:0051434  BH3 domain binding  2 of 6  2.54  0.96  0.0086  TP53  Methyl Blue 1
GO:0004861  Cyclin-dependent protein serine/threonine kinase inhibitor activity 2 of 12 2.24  0.78  0.0206  TP53  Methyl Blue 1
GO:0019899  Enzyme binding  12 of 2084  0.78  0.74  4.91e-05  TP53  Methyl Blue 0
GO:0046982  Protein heterodimerization activity 5 of 368  1.15  0.72  0.0084  TP53  Methyl Blue 0
GO:0008134  Transcription factor binding  6 of 587  1.03  0.7 0.0058  TP53  Methyl Blue 0
GO:0001228  DNA-binding transcription activator activity, RNA polymerase II-specific  5 of 458  1.05  0.59  0.0203  TP53  Methyl Blue 0
GO:0140297  DNA-binding transcription factor binding  5 of 484  1.03  0.58  0.0206  TP53  Methyl Blue 0
GO:0019900  Kinase binding  6 of 785  0.9 0.53  0.0203  TP53  Methyl Blue 0
GO:0042802  Identical protein binding 10 of 2144  0.68  0.52  0.0039  TP53  Methyl Blue 0
GO:0044877  Protein-containing complex binding  7 of 1261 0.76  0.45  0.0266  TP53  Methyl Blue 0
GO:0005515  Protein binding 18 of 7242  0.41  0.4 0.00013 TP53  Methyl Blue 0
GO:0097136  Bcl-2 family protein complex  3 of 10 2.49  1.81  0.00011 TP53  Methyl Blue 1
GO:0005741  Mitochondrial outer membrane  6 of 209  1.47  1.45  7.73e-05  TP53  Methyl Blue 0
GO:0005635  Nuclear envelope  6 of 487  1.11  0.89  0.0012  TP53  Methyl Blue 0
GO:0017053  Transcription repressor complex 3 of 77 1.61  0.83  0.0104  TP53  Methyl Blue 1
GO:0005730  Nucleolus 8 of 996  0.92  0.79  0.00059 TP53  Methyl Blue 0
GO:0031967  Organelle envelope  9 of 1262 0.87  0.76  0.00039 TP53  Methyl Blue 0
GO:0046930  Pore complex  2 of 27 1.89  0.65  0.0360  TP53  Methyl Blue 1
GO:0031965  Nuclear membrane  4 of 305  1.13  0.59  0.0244  TP53  Methyl Blue 0
GO:0005739  Mitochondrion 9 of 1681 0.74  0.58  0.0023  TP53  Methyl Blue 0
GO:0005783  Endoplasmic reticulum 8 of 2021 0.61  0.37  0.0356  TP53  Methyl Blue 0
GO:0005654  Nucleoplasm 12 of 4169  0.47  0.35  0.0124  TP53  Methyl Blue 0
GO:0032991  Protein-containing complex  14 of 5506  0.42  0.33  0.0095  TP53  Methyl Blue 0
GO:0005634  Nucleus 16 of 7672  0.33  0.29  0.0110  TP53  Methyl Blue 0
GO:0005829  Cytosol 13 of 5438  0.39  0.29  0.0287  TP53  Methyl Blue 0
GO:0043227  Membrane-bounded organelle  19 of 13188 0.17  0.2 0.0479  TP53  Methyl Blue 0
BTO:0002254 Vas efferens  3 of 5  2.79  1.85  0.00010 TP53  Methyl Blue 1
BTO:0001967 Cervical cancer cell line 3 of 10 2.49  1.78  0.00013 TP53  Methyl Blue 1
BTO:0000407 Osteosarcoma cell line  3 of 16 2.29  1.62  0.00025 TP53  Methyl Blue 1
BTO:0001913 Colonic adenocarcinoma cell line  3 of 27 2.06  1.39  0.00073 TP53  Methyl Blue 1
BTO:0006256 Solid cancer cell 2 of 2  3.02  1.38  0.0011  TP53  Methyl Blue 1
BTO:0005257 Oropharynx  2 of 2  3.02  1.38  0.0011  TP53  Methyl Blue 1
BTO:0000458 WI-38 cell  2 of 2  3.02  1.38  0.0011  TP53  Methyl Blue 1
BTO:0001938 U2-OS cell  2 of 5  2.62  1.26  0.0019  TP53  Methyl Blue 1
BTO:0001332 DU-145 cell 2 of 5  2.62  1.26  0.0019  TP53  Methyl Blue 1
BTO:0000182 HT-29 cell  2 of 5  2.62  1.26  0.0019  TP53  Methyl Blue 1
BTO:0003080 Pleural fluid 2 of 6  2.54  1.22  0.0023  TP53  Methyl Blue 1
BTO:0000567 HeLa cell 2 of 6  2.54  1.22  0.0023  TP53  Methyl Blue 1
BTO:0000583 Bone marrow cancer cell 7 of 442  1.22  1.19  0.00010 TP53  Methyl Blue 0
BTO:0004725 Embryonic fibroblast  2 of 7  2.47  1.19  0.0026  TP53  Methyl Blue 1
BTO:0001615 Colorectal cancer cell  2 of 7  2.47  1.19  0.0026  TP53  Methyl Blue 1
BTO:0001130 Prostate gland cancer cell  2 of 7  2.47  1.19  0.0026  TP53  Methyl Blue 1
BTO:0000599 Hep-G2 cell 2 of 7  2.47  1.19  0.0026  TP53  Methyl Blue 1
BTO:0000018 A-549 cell  2 of 7  2.47  1.19  0.0026  TP53  Methyl Blue 1
BTO:0000773 Lymphoblastoid cell line  3 of 49 1.8 1.17  0.0018  TP53  Methyl Blue 1
BTO:0000093 MCF-7 cell  2 of 11 2.28  1.08  0.0044  TP53  Methyl Blue 1
BTO:0000669 Embryonic cell line 3 of 73 1.63  1.02  0.0035  TP53  Methyl Blue 1
BTO:0001049 Pharynx 3 of 75 1.62  1.0 0.0037  TP53  Methyl Blue 1
BTO:0004254 Cancer stem cell  2 of 16 2.11  0.96  0.0077  TP53  Methyl Blue 1
BTO:0000353 Lung cell line  3 of 85 1.56  0.94  0.0049  TP53  Methyl Blue 1
BTO:0000580 Blood cancer cell 10 of 1234  0.92  0.88  0.00010 TP53  Methyl Blue 0
BTO:0001271 Leukemia cell 9 of 1067 0.94  0.88  0.00013 TP53  Methyl Blue 0
BTO:0000452 Fibroblast  4 of 224  1.27  0.86  0.0039  TP53  Methyl Blue 0
BTO:0000737 Leukemia cell line  3 of 108  1.46  0.83  0.0085  TP53  Methyl Blue 1
BTO:0000426 Erythroleukemia cell  4 of 244  1.23  0.82  0.0049  TP53  Methyl Blue 0
BTO:0001340 Bronchus  2 of 25 1.92  0.82  0.0148  TP53  Methyl Blue 1
BTO:0002144 Acute lymphoblastic leukemia cell line  2 of 34 1.79  0.71  0.0244  TP53  Methyl Blue 1
BTO:0000740 Myeloid leukemia cell line  2 of 36 1.76  0.7 0.0253  TP53  Methyl Blue 1
BTO:0000180 Cervical carcinoma cell 4 of 322  1.11  0.67  0.0118  TP53  Methyl Blue 0
BTO:0000744 Lymphocytic leukemia cell 4 of 419  1.0 0.54  0.0253  TP53  Methyl Blue 0
BTO:0001541 Pronephros  3 of 219  1.15  0.52  0.0432  TP53  Methyl Blue 0
BTO:0001546 Chronic lymphocytic leukemia cell 3 of 222  1.15  0.52  0.0443  TP53  Methyl Blue 0
BTO:0000284 Organism form 10 of 2542  0.61  0.48  0.0028  TP53  Methyl Blue 0
BTO:0001253 Skin  6 of 1151 0.73  0.44  0.0249  TP53  Methyl Blue 0
BTO:0000379 Embryo  5 of 824  0.8 0.44  0.0339  TP53  Methyl Blue 0
BTO:0000174 Embryonic structure 9 of 2369 0.6 0.43  0.0085  TP53  Methyl Blue 0
BTO:0003099 Internal female genital organ 9 of 2804 0.52  0.35  0.0249  TP53  Methyl Blue 0
BTO:0000083 Female reproductive system  13 of 6111  0.34  0.27  0.0332  TP53  Methyl Blue 0
"""

# Fallback layout for records whose columns are separated by runs of spaces
# rather than tabs. The "<k> of <n>" counts anchor the match: everything
# between the ID and the counts is the free-text description, and the four
# whitespace-free tokens after the counts are value1, value2, p-value, protein.
_ROW_PATTERN = re.compile(
    r'^(?P<biomaterial>\S+)\s+'
    r'(?P<description>.+?)\s+'
    r'(?P<successes>\d+)\s+of\s+(?P<total>\d+)\s+'
    r'(?P<value1>\S+)\s+'
    r'(?P<value2>\S+)\s+'
    r'(?P<p_value>\S+)\s+'
    r'(?P<protein>\S+)\s+'
    r'(?P<drug>.+?)\s+'
    r'(?P<target>\d+)$'
)


def parse_data(data):
    """Parse the multi-line record text into a list of row lists.

    Each non-blank line must hold, in order: an ID, a description,
    "<k> of <n>" counts, two numeric values, a p-value, a protein name,
    a drug name and an integer target. Columns may be tab-separated or
    separated by arbitrary whitespace; in the latter case the protein
    name must not contain spaces.

    Args:
        data: The raw text, one record per line.

    Returns:
        A list of rows, each
        ``[successes, total, value1, value2, p_value, protein, drug,
        target, biomaterial]`` where ``biomaterial`` is the record's ID.
        Blank lines are skipped; empty input yields ``[]``.

    Raises:
        ValueError: If a line matches neither layout or a numeric field
            cannot be converted.
    """
    data_list = []
    for line_number, raw_line in enumerate(data.strip().splitlines(), start=1):
        line = raw_line.strip()
        if not line:
            continue

        parts = line.split('\t')
        if len(parts) >= 9:
            # Tab-separated layout: columns are positional, column 1
            # (description) is ignored.
            counts = parts[2].split(' of ')
            if len(counts) != 2:
                raise ValueError(
                    f"line {line_number}: expected '<k> of <n>' counts, got {parts[2]!r}"
                )
            successes, total = counts
            biomaterial = parts[0]
            value1, value2, p_value = parts[3], parts[4], parts[5]
            protein, drug, target = parts[6], parts[7], parts[8]
        else:
            # Space-separated layout (e.g. tabs lost when the text was
            # copied): recover the columns with the anchored pattern.
            match = _ROW_PATTERN.match(line)
            if match is None:
                raise ValueError(f"line {line_number}: unrecognised record {line!r}")
            successes, total = match['successes'], match['total']
            biomaterial = match['biomaterial']
            value1, value2, p_value = match['value1'], match['value2'], match['p_value']
            protein, drug, target = match['protein'], match['drug'], match['target']

        data_list.append([
            int(successes),
            int(total),
            float(value1),
            float(value2),
            float(p_value),
            protein.strip(),
            drug.strip(),
            int(target),
            biomaterial.strip(),
        ])
    return data_list

# Load the parsed records into a DataFrame, one row per record.
data_list = parse_data(data)
df = pd.DataFrame(
    data_list,
    columns=[
        'Successes', 'Total', 'Value1', 'Value2', 'P_Value',
        'Protein', 'Drug', 'Target', 'Biomaterial',
    ],
)

# Data Preprocessing - Encoding Categorical Features
# A single encoder is refit for each column in turn, so after this loop it
# only remembers the mapping of the last column ('Biomaterial').
label_encoder = LabelEncoder()
for _categorical in ('Protein', 'Drug', 'Biomaterial'):
    df[_categorical] = label_encoder.fit_transform(df[_categorical])

# Feature Engineering
df['Accuracy'] = df['Successes'].div(df['Total'])
df['LogTotal'] = np.log1p(df['Total'])
df['LogPValue'] = np.negative(np.log10(df['P_Value']))
df['Value1_x_Value2'] = df['Value1'].mul(df['Value2'])
# The small epsilon keeps the two ratios below finite when the divisor is 0.
df['Success_Ratio'] = df['Successes'].div(df['Total'].add(1e-6))
df['Value1_Div_Value2'] = df['Value1'].div(df['Value2'].add(1e-6))
df['Combined_Value'] = df['Value1'].add(df['Value2']).add(df['LogTotal'])

# Define features and target
_feature_columns = [
    'Successes', 'Total', 'Value1', 'Value2', 'P_Value',
    'Protein', 'Drug', 'Biomaterial',
    'Accuracy', 'LogTotal', 'LogPValue', 'Value1_x_Value2',
    'Success_Ratio', 'Value1_Div_Value2', 'Combined_Value',
]
X = df[_feature_columns]
y = df['Target']

# Data Splitting: 80/20 hold-out, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

# Custom F1 Scorer
def safe_f1(y_true, y_pred):
    """Return the binary F1 score, using 0 where the score is undefined.

    ``zero_division=0`` makes f1_score return 0 instead of warning when
    there are no positive predictions or labels to score against.
    """
    return f1_score(y_true, y_pred, zero_division=0, average='binary')


# Scorer object usable as the ``scoring`` argument of cross-validation helpers.
custom_f1 = make_scorer(safe_f1)

# Pipelines
def create_pipeline(model, select_features=False, k=10, l2_penalty=0.0):
    """Wrap ``model`` in a preprocessing pipeline.

    The pipeline applies, in order: median imputation, standard scaling,
    a quantile transform to a normal output distribution, an optional
    ``SelectKBest(f_classif, k=k)`` step named ``'feature_selection'``,
    and finally the estimator under the step name ``'model'``.

    Args:
        model: The estimator placed at the end of the pipeline.
        select_features: When true, insert the SelectKBest step.
        k: Number of features kept by SelectKBest.
        l2_penalty: Only used for LogisticRegression models, where it is
            converted to ``C = 1 / (l2_penalty + 1e-9)``.

    Returns:
        The assembled, unfitted ``Pipeline``.
    """
    # LogisticRegression instances are reconfigured in place: the object the
    # caller passed in ends up with penalty='l2' and the derived C.
    if isinstance(model, LogisticRegression):
        model.penalty = 'l2'
        model.C = 1.0 / (l2_penalty + 1e-9)

    preprocessing = [
        ('imputer', SimpleImputer(strategy='median')),
        ('scaler', StandardScaler()),
        ('transformer', QuantileTransformer(output_distribution='normal')),
    ]
    selection = (
        [('feature_selection', SelectKBest(score_func=f_classif, k=k))]
        if select_features
        else []
    )
    return Pipeline(preprocessing + selection + [('model', model)])

# Models: deliberately small/regularised configurations.
svm_model = SVC(
    kernel='rbf',
    C=0.1,
    gamma='scale',
    random_state=42,
)
rf_model = RandomForestClassifier(
    n_estimators=50,
    max_depth=5,
    min_samples_leaf=5,
    random_state=42,
    class_weight='balanced',
)
gb_model = GradientBoostingClassifier(
    n_estimators=50,
    learning_rate=0.1,
    max_depth=3,
    random_state=42,
)
# alpha is the MLP's L2 regularisation strength.
nn_model = MLPClassifier(
    hidden_layer_sizes=(32,),
    activation='relu',
    solver='adam',
    random_state=42,
    max_iter=200,
    early_stopping=True,
    alpha=0.01,
)

# Create Pipelines WITH feature selection.  Tune the 'k' value!
_selection_options = {'select_features': True, 'k': 7}
svm_pipeline = create_pipeline(svm_model, **_selection_options)
rf_pipeline = create_pipeline(rf_model, **_selection_options)
gb_pipeline = create_pipeline(gb_model, **_selection_options)
nn_pipeline = create_pipeline(nn_model, **_selection_options)

# Training and Evaluation
def train_evaluate_model(pipeline, model_name, X_train, y_train, X_test, y_test, cv=10):
    """Cross-validate, fit and evaluate ``pipeline``, printing the results.

    Cross-validation runs on the training data with the F1 scorer
    ``custom_f1``. Folds are stratified on ``y_train`` so that every
    validation fold contains both classes, matching the stratified
    hold-out split; with small training sets, unstratified folds can end
    up single-class and yield a degenerate F1 of 0.

    Args:
        pipeline: Unfitted estimator/pipeline; it is fitted in place.
        model_name: Label used in the printed output.
        X_train, y_train: Training features and labels.
        X_test, y_test: Hold-out features and labels.
        cv: Requested number of cross-validation folds. It is reduced to
            the size of the smallest class when that is smaller, because
            stratified splitting needs at least one sample of every class
            per fold.

    Returns:
        The same ``pipeline`` object, fitted on the full training data.
    """
    print(f"Training and Evaluating {model_name}...")

    # Pick the splitter: stratified where possible, otherwise plain KFold so
    # that inputs with a class of fewer than two samples are still handled.
    _, class_counts = np.unique(np.asarray(y_train), return_counts=True)
    smallest_class = int(class_counts.min()) if class_counts.size else 0
    if smallest_class >= 2:
        splitter = StratifiedKFold(
            n_splits=min(cv, smallest_class), shuffle=True, random_state=42
        )
    else:
        splitter = KFold(n_splits=cv, shuffle=True, random_state=42)

    # Cross-validation
    cv_scores = cross_val_score(pipeline, X_train, y_train, cv=splitter, scoring=custom_f1)
    print(f"{model_name} Cross-Validation F1 Scores: {cv_scores}")
    print(f"{model_name} Mean Cross-Validation F1 Score: {cv_scores.mean():.4f}")

    # Train on full training data
    pipeline.fit(X_train, y_train)

    # Predictions and Evaluation
    y_pred = pipeline.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    f1 = safe_f1(y_test, y_pred)
    print(f"{model_name} Accuracy on Test Set: {accuracy:.4f}")
    print(f"{model_name} F1 Score on Test Set: {f1:.4f}")
    print(f"{model_name} Classification Report:\n{classification_report(y_test, y_pred)}")

    return pipeline

# Train and Evaluate every pipeline on the same train/test split.
_split = (X_train, y_train, X_test, y_test)
svm_pipeline = train_evaluate_model(svm_pipeline, "SVM", *_split)
rf_pipeline = train_evaluate_model(rf_pipeline, "Random Forest", *_split)
gb_pipeline = train_evaluate_model(gb_pipeline, "Gradient Boosting", *_split)
nn_pipeline = train_evaluate_model(nn_pipeline, "Neural Network", *_split)

# --- Feature Importance (Random Forest) ---
print("\nFeature Importance (Random Forest - Trained on ALL data):") #Use RF for Feature Importance
rf_pipeline.fit(X, y) #Fit on all Data for final training

if hasattr(rf_pipeline.named_steps['model'], 'feature_importances_'):
    importance = rf_pipeline.named_steps['model'].feature_importances_

    # The forest is trained only on the columns that survive the pipeline's
    # 'feature_selection' step, so its importances line up with the selected
    # columns, not with the leading columns of X. Map them back through the
    # selector's boolean support mask; without a selector all columns apply.
    if 'feature_selection' in rf_pipeline.named_steps:
        selected_mask = rf_pipeline.named_steps['feature_selection'].get_support()
        feature_names = X.columns[selected_mask]
    else:
        feature_names = X.columns

    for name, score in zip(feature_names, importance):
        print(f"Feature: {name}, Score: {score:.4f}")
else:
    print("Random Forest Model does not have feature_importances_ attribute.")

Molecular AND CELLULAR RELATED CODE © 2025 by Devise Foundation is licensed under CC BY-NC-ND 4.0 
Devise Foundation

Monday, February 24, 2025

MOLECULAR AND CELLULAR RELATED CODE

No comments:

Post a Comment

From Paikpara’s Lanes to Titagarh’s Bazaar—My Food Memories

Report Abuse

Labels

Popular Posts