import json

import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.impute import KNNImputer
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
# Start Code:
# Column-oriented table of docking results: every key maps to a list of 20
# values. Columns whose values simply alternate between two numbers are
# written as a repeated pair; columns that differ per row are spelled out.
data = {
    'Binding Affinity': [
        -8.297, -8.049, -7.930, -7.724, -7.612,
        -7.386, -7.342, -7.218, -7.155, -7.042,
        -6.940, -6.933, -6.885, -6.846, -6.808,
        -6.770, -6.536, -6.533, -6.529, -6.407,
    ],
    # Pocket descriptors (two alternating value sets).
    'Volume': [1964.45, 1780.27] * 10,
    'Surface': [2366.68, 2400.50] * 10,
    'Depth': [33.29, 33.51] * 10,
    'Ellipsoid Ratio c/a': [0.15, 0.11] * 10,
    'Ellipsoid Ratio b/a': [0.26, 0.19] * 10,
    'Enclosure': [0.06, 0.16] * 10,
    'Pocket Atoms': [437, 310] * 10,
    'Carbons': [286, 217] * 10,
    'Nitrogens': [61, 44] * 10,
    'Oxygens': [83, 47] * 10,
    'Sulfurs': [2, 0] * 10,
    'Other Elements': [5, 2] * 10,
    'Hydrogen Bond Donors': [61, 47] * 10,
    'Hydrogen Bond Acceptors': [150, 95] * 10,
    'Metals': [4, 2] * 10,
    'Hydrophobic Interactions': [109, 111] * 10,
    'Hydrophobicity Ratio': [0.34, 0.44] * 10,
    'Apolar Amino Acid Ratio': [0.26, 0.44] * 10,
    'Polar Amino Acid Ratio': [0.33, 0.29] * 10,
    'Positive Amino Acid Ratio': [0.15, 0.20] * 10,
    'Negative Amino Acid Ratio': [0.21, 0.05] * 10,
    # Amino-acid counts. Original author's note: this is the updated set and
    # should be used for all models, as it is consistent across them.
    'ALA': [0, 3] * 10,
    'ARG': [5, 2] * 10,
    'ASN': [4, 2] * 10,
    'ASP': [15, 2] * 10,
    'CYS': [0] * 20,
    'GLN': [0, 2] * 10,
    'GLU': [5, 1] * 10,
    'GLY': [11, 6] * 10,
    'HIS': [6, 3] * 10,
    'ILE': [8, 2] * 10,
    'LEU': [2, 9] * 10,
    'LYS': [3, 8] * 10,
    'MET': [2, 1] * 10,
    'PHE': [5, 3] * 10,
    'PRO': [5, 6] * 10,
    'SER': [5, 3] * 10,
    'THR': [3] * 20,
    'TRP': [0, 1] * 10,
    'TYR': [8, 3] * 10,
    'VAL': [3, 4] * 10,
    # Residue flags (author's note: read from an image via OCR). Every row
    # is 1, i.e. each residue is marked present for all 20 rows.
    'Phe231A': [1] * 20,
    'Leu218A': [1] * 20,
    'Leu197A': [1] * 20,
    'Thr226A': [1] * 20,
    'His201A': [1] * 20,
    'Tyr223A': [1] * 20,
    'Lys119A': [1] * 20,
    'Lys228A': [1] * 20,
    # Author's note: what is known for certain from the ligand.
    'LIG1': [1] * 20,
    'C': [1] * 20,
    'N': [1] * 20,
    'O': [1] * 20,
    # Parameters. 'vina_dock_model' and 'Model' both number the rows 1..20.
    'vina_dock_model': [*range(1, 21)],
    'Model': [*range(1, 21)],
    'Molweight': [371.52] * 20,
    'Number of hydrogen bond acceptors': [2] * 20,
    'Number of hydrogen bond donors': [0] * 20,
    'Number of rotable bonds': [8] * 20,
    'hydrophobic_interactions': [109, 111] * 10,
    'RESIDUE': [
        662.408, 676.521, 667.184, 657.759, 661.434,
        662.727, 674.381, 670.491, 671.178, 660.898,
        671.713, 672.719, 666.313, 667.567, 667.751,
        663.67, 666.593, 665.243, 662.001, 666.218,
    ],
    # Per-row x/y/z coordinates (author's note: "coordinates to chain and
    # chain position" -- NOTE(review): units/reference frame not stated).
    'coord_x': [
        12.3773, 14.1198, 13.8247, 13.4709, 12.7294,
        13.2786, 14.5773, 12.9101, 13.0883, 14.0219,
        11.6945, 13.5996, 13.2321, 13.9812, 13.5193,
        12.9665, 14.3094, 14.8302, 11.615, 12.9273,
    ],
    'coord_y': [
        27.1536, 30.3026, 29.6459, 28.5711, 27.6687,
        19.7112, 31.0198, 21.5947, 29.2522, 29.7841,
        26.8426, 21.8593, 28.5237, 19.6639, 27.1798,
        28.8583, 22.542, 26.5058, 23.464, 29.566,
    ],
    'coord_z': [
        60.7158, 62.2514, 61.8274, 61.021, 61.2426,
        61.5766, 62.0441, 61.4143, 61.6266, 62.4846,
        61.2329, 62.2616, 61.1592, 62.5268, 61.9475,
        61.4963, 62.572, 64.2347, 61.1289, 61.4714,
    ],
}
# ADMET / toxicity descriptor table. Every descriptor holds one scalar that
# is repeated for all 20 rows, so the scalars are listed once and expanded
# into 20-element lists by the comprehension (key order is preserved).
admet_data = {
    descriptor: [value] * 20
    for descriptor, value in {
        'Molecular Weight (MW)': 424.14,
        'Volume_ADMET': 444.27,
        'Density': 0.955,
        'nHA': 6.0,
        'nHD': 1.0,
        'nRot': 4.0,
        'nRing': 4.0,
        'MaxRing': 10.0,
        'nHet': 6.0,
        'fChar': 0.0,
        'nRig': 27.0,
        'Flexibility': 0.148,
        'Stereo Centers': 0.0,
        'TPSA': 81.3,
        'logS': -4.438,
        'logP': 4.138,
        'logD7.4': 3.039,
        'pka (Acid)': 4.266,
        'pka (Base)': 1.947,
        'Melting point': 198.253,
        'Boiling point': 314.451,
        'QED': 0.511,
        'SAscore': 1,
        'GASA': 1,
        'Fsp3': 0.115,
        'MCE-18': 23.0,
        'NPscore': -0.742,
        'Colloidal aggregators': 0.73,
        'FLuc inhibitors': 0.175,
        'Blue fluorescence': 0.277,
        'Green fluorescence': 0.766,
        'Reactive compounds': 0.004,
        'Promiscuous compounds': 0.011,
        'Caco-2 Permeability': -4.882,
        'MDCK Permeability': -4.666,
        'PPB': 99.0,
        'VDss': -0.35,
        'Fu': 0.5,
        'CLplasma': 0.555,
        'T1/2': 1.346,
        'Aquatic Toxicity Rule': 0,
        'Genotoxic Carcinogenicity Mutagenicity Rule': 0,
        'NonGenotoxic Carcinogenicity Rule': 0,
        'Skin Sensitization Rule': 1,
        'Acute Toxicity Rule': 0,
        'NonBiodegradable': 0,
        'SureChEMBL Rule': 0,
        'FAF-Drugs4 Rule': 4,
        'hERG Blockers': 0.521,
        'hERG Blockers (10um)': 0.315,
        'DILI': 0.962,
        'AMES Toxicity': 0.81,
        'Rat Oral Acute Toxicity': 0.226,
        'FDAMDD': 0.841,
        'Skin Sensitization_ADMET': 0.354,
        'Carcinogenicity_ADMET': 0.388,
        'Eye Corrosion': 0.0,
        'Eye Irritation': 0.078,
        'Respiratory': 0.436,
        'Human Hepatotoxicity': 0.875,
        'Drug-induced Nephrotoxicity': 0.878,
        'Drug-induced Neurotoxicity': 0.911,
        'Ototoxicity': 0.864,
        'Hematotoxicity': 0.41,
        'Genotoxicity_ADMET': 0.994,
        'RPMI-8226 Immunitoxicity': 0.098,
        'A549 Cytotoxicity': 0.114,
        'Hek293 Cytotoxicity': 0.529,
        'BCF': 0.987,
        'IGC50': 4.088,
        'LC50DM': 5.143,
        'LC50FM': 4.794,
        'Molweight_provided': 371.52,
        'Number of hydrogen bond acceptors_provided': 2,
        'Number of hydrogen bond donors_provided': 0,
        'Number of atoms_provided': 28,
        'Number of bonds_provided': 30,
        'Number of rotable bonds_provided': 8,
        'Molecular refractivity': 119.72,
        'Topological Polar Surface Area': 12.47,
        'octanolwater partition coefficientlogP': 6,
        'Predicted LD50: 1190mgkg': 1190,
        'Predicted Toxicity Class': 4,
        'Hepatotoxicity_PRO': 0.69,
        'Neurotoxicity_PRO': 0.87,
        'Nephrotoxicity_PRO': 0.90,
        'Respiratory toxicity_PRO': 0.98,
        'Cardiotoxicity': 0.77,
        'Carcinogenicity_PRO': 0.62,
        'Immunotoxicity_PRO': 0.96,
        'Mutagenicity_PRO': 0.97,
        'Cytotoxicity_PRO': 0.93,
        'BBBbarrier': 1.0,
        'Ecotoxicity': 0.73,
        'Clinical toxicity': 0.56,
        'Nutritional toxicity': 0.74,
        'Aryl hydrocarbon ReceptorAhR': 0.97,
        'Androgen ReceptorAR': 0.99,
        'Androgen Receptor Ligand Binding DomainARLBD': 0.99,
        'Aromatase': 1.0,
        'Estrogen Receptor AlphaER': 0.99,
        'Estrogen Receptor Ligand Binding DomainERLBD': 1.0,
        'Peroxisome Proliferator Activated Receptor GammaPPARGamma': 0.99,
        'Nuclear factorerythroidderived 2like 2antioxidant responsive elementnrf2ARE': 0.88,
        'Heat shock factor response elementHSE': 0.88,
        'Mitochondrial Membrane PotentialMMP': 0.70,
        'Phosphoprotein Tumor Supressor p53': 0.96,
        'ATPase family AAA domaincontaining protein 5ATAD5': 0.99,
        'Thyroid hormone receptor alphaTHRα': 0.90,
        'Thyroid hormone receptor betaTHRβ': 0.78,
        'TranstyretrinTTR': 0.97,
        'Ryanodine receptorRYR': 0.98,
        'GABA receptorGABAR': 0.96,
        'Glutamate NmethylDaspartate receptorNMDAR': 0.92,
        'alphaamino3hydroxy5methyl4isoxazolepropionate receptorAMPAR': 0.97,
        'Kainate receptorKAR': 0.99,
        'AchetylcholinesteraseAChE': 0.69,
        'Constitutive androstane receptorCAR': 0.98,
        'Pregnane X receptorPXR': 0.92,
        'NADHquinone oxidoreductaseNADHOX': 0.97,
        'Voltage gated sodium channelVGSC': 0.95,
        'NaI symporterNIS': 0.98,
        'Cytochrome CYP1A2': 0.76,
        'Cytochrome CYP2C19': 0.87,
        'Cytochrome CYP2C9': 0.56,
        'Cytochrome CYP2D6': 0.63,
        'Cytochrome CYP3A4': 0.71,
        'Cytochrome CYP2E1': 0.98,
    }.items()
}
# --- Assemble the modelling table -----------------------------------------
# Docking/pocket columns and ADMET columns are placed side by side; both
# tables have 20 rows, so the default positional index lines them up.
df = pd.DataFrame(data)
admet_df = pd.DataFrame(admet_data)
df = pd.concat([df, admet_df], axis=1)
# NOTE: the original script merged with `binding_affinity_df` here, a frame
# that is never defined in this file (NameError). The regression target is
# already present as the 'Binding Affinity' column, so no merge is needed.
# Scalars broadcast to every row.
df['Distance-1 ExcludeAliphatic'] = 0.839159
df['Distance-1 ExcludeAromatic'] = 0.839159

# --- Features / target ----------------------------------------------------
# The previously referenced 'Calculated affinity (kcal/mol)' column does not
# exist in this table (KeyError); 'Binding Affinity' is the only affinity
# column available.
target_column = 'Binding Affinity'
# 'Model' and 'vina_dock_model' just number the rows 1..20, and the affinity
# values are listed in that same order, so keeping them as features would
# let the model read the target's rank straight from the row id.
id_columns = ['Model', 'vina_dock_model']
X = df.drop(columns=[target_column, *id_columns])
y = df[target_column]

# --- Hold-out evaluation --------------------------------------------------
# Imputation and scaling live inside the pipeline so their statistics are
# learned from training rows only; fitting them on the full table before
# splitting leaks information from the test rows into the model.
imputer = KNNImputer(n_neighbors=5)
scaler = StandardScaler()
model = make_pipeline(imputer, scaler, Ridge(alpha=1.0))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)
model.fit(X_train, y_train)
y_pred = model.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("R-squared:", r2)
print("MAE:", mae)
print("RMSE:", rmse)

# --- 5-fold cross-validation ----------------------------------------------
# cross_val_score clones and refits the whole pipeline for every fold, so
# preprocessing is re-learned from each fold's training rows as well.
kf = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = cross_val_score(model, X, y, cv=kf, scoring='r2')
print("Cross-validation R-squared scores:", cv_scores)
print("Mean cross-validation R-squared score:", np.mean(cv_scores))
# Drug Discovery © 2025 by Devise Foundation is licensed under CC BY-NC-ND 4.0
# (Blog page footer captured with the code:) No comments: / Post a Comment