import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
# Original dataset: 30 named records (P_0 ... P_35), stored column-wise.
data_original = {
    'Name': ['P_0', 'P_1', 'P_10', 'P_11', 'P_12', 'P_13', 'P_14', 'P_15', 'P_16', 'P_17', 'P_18', 'P_19', 'P_2', 'P_20', 'P_21', 'P_22', 'P_23', 'P_24', 'P_25', 'P_26', 'P_27', 'P_28', 'P_29', 'P_3', 'P_30', 'P_31', 'P_32', 'P_33', 'P_34', 'P_35'],
    'Volume ų': [1964.45, 1780.27, 488.96, 375.4, 285.7, 249.36, 227.1, 223.24, 209.39, 206.21, 199.17, 181.23, 1230.68, 170.1, 150.57, 145.35, 136.72, 129.0, 120.37, 117.64, 117.19, 112.19, 109.24, 1155.51, 107.19, 105.38, 103.56, 101.52, 101.52, 100.15],
    'Surface Ų': [2366.68, 2400.5, 833.79, 736.43, 409.39, 377.28, 556.29, 503.26, 376.45, 429.0, 309.26, 404.76, 1564.31, 474.34, 343.34, 347.46, 294.9, 277.52, 232.13, 254.77, 184.68, 194.29, 163.88, 1702.57, 305.14, 258.94, 193.35, 152.28, 242.22, 197.91],
    'Drug Score': [0.8, 0.81, 0.8, 0.63, 0.42, 0.49, 0.39, 0.37, 0.43, 0.35, 0.52, 0.32, 0.81, 0.33, 0.29, 0.25, 0.22, 0.2, 0.19, 0.22, 0.19, 0.16, 0.2, 0.8, 0.17, 0.15, 0.18, 0.17, 0.2, 0.22],
    'Binding Affinity': [5.2, 4.8, 6.1, 5.5, 7.2, 6.5, 5.8, 6.0, 5.9, 6.2, 5.1, 6.8, 4.9, 6.4, 6.6, 6.3, 5.7, 6.9, 5.4, 6.1, 5.6, 6.7, 5.3, 4.7, 6.5, 5.9, 6.0, 5.8, 5.2, 6.4],
    'ADMET_Absorption': [0.8, 0.7, 0.9, 0.6, 0.5, 0.8, 0.7, 0.6, 0.9, 0.8, 0.7, 0.5, 0.9, 0.6, 0.8, 0.7, 0.6, 0.5, 0.8, 0.9, 0.7, 0.6, 0.8, 0.9, 0.7, 0.6, 0.8, 0.9, 0.7, 0.6],
    'ADMET_Distribution': [0.9, 0.8, 0.7, 0.9, 0.6, 0.8, 0.9, 0.7, 0.8, 0.9, 0.7, 0.6, 0.8, 0.9, 0.7, 0.8, 0.9, 0.7, 0.8, 0.9, 0.7, 0.8, 0.9, 0.8, 0.9, 0.7, 0.8, 0.9, 0.8, 0.7],
    'Toxicity': [0.1, 0.2, 0.1, 0.3, 0.2, 0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2, 0.1, 0.3, 0.2, 0.1, 0.2, 0.3, 0.1, 0.2, 0.1, 0.3, 0.2],
}
df_original = pd.DataFrame(data_original)

# Derived feature: element-wise volume divided by surface.
df_original['Volume_to_Surface_Ratio'] = df_original['Volume ų'].div(df_original['Surface Ų'])

# Additional descriptor columns. Each one holds a single value repeated on
# every row, so these columns do not vary across the dataset.
constant_descriptors = {
    'Molweight': 371.52,
    'Number of hydrogen bond acceptors': 2,
    'Number of hydrogen bond donors': 0,
    'Number of atoms': 28,
    'Number of bonds': 30,
    'Number of rotable bonds': 8,
    'Molecular refractivity': 119.72,
    'Topological Polar Surface Area': 12.47,
    'octanol/water partition coefficient(logP)': 6,
    'Predicted LD50': 1190,
    'Predicted Toxicity Class': 4,
}
for descriptor_name, descriptor_value in constant_descriptors.items():
    # One copy of the value per existing row, appended as a new column.
    df_original[descriptor_name] = [descriptor_value] * len(df_original)
# Model inputs: the measured, derived and descriptor columns. 'Name' and the
# target column 'Drug Score' are deliberately not part of the feature list.
feature_columns = [
    'Volume ų',
    'Surface Ų',
    'Volume_to_Surface_Ratio',
    'Binding Affinity',
    'ADMET_Absorption',
    'ADMET_Distribution',
    'Toxicity',
    'Molweight',
    'Number of hydrogen bond acceptors',
    'Number of hydrogen bond donors',
    'Number of atoms',
    'Number of bonds',
    'Number of rotable bonds',
    'Molecular refractivity',
    'Topological Polar Surface Area',
    'octanol/water partition coefficient(logP)',
    'Predicted LD50',
    'Predicted Toxicity Class',
]
X = df_original[feature_columns]
# Regression target.
y = df_original['Drug Score']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both the training and the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Hyperparameter tuning: exhaustive search over every combination below
# (4 * 3 * 3 * 3 = 108 candidates), each scored with 5-fold cross-validation
# on the scaled training rows.
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
}
# Seed the forest: without a fixed random_state the bootstrap sampling differs
# on every run, so the selected parameters, the CV score, the test MSE and any
# later prediction would not be reproducible even though the split is seeded.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train_scaled, y_train)
print(f"Best Parameters: {grid_search.best_params_}")
# The scorer is the *negated* MSE, so flip the sign to report a plain MSE.
print(f"Best Score: {-grid_search.best_score_}")
# Evaluate the best model on the held-out rows that the search never saw.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
# Example prediction: one hand-written record whose keys are the training
# feature columns, listed in the same order as the feature selection.
example_row = {
    'Volume ų': 1000,
    'Surface Ų': 1500,
    'Volume_to_Surface_Ratio': 1000/1500,
    'Binding Affinity': 5.5,
    'ADMET_Absorption': 0.8,
    'ADMET_Distribution': 0.9,
    'Toxicity': 0.2,
    'Molweight': 371.52,
    'Number of hydrogen bond acceptors': 2,
    'Number of hydrogen bond donors': 0,
    'Number of atoms': 28,
    'Number of bonds': 30,
    'Number of rotable bonds': 8,
    'Molecular refractivity': 119.72,
    'Topological Polar Surface Area': 12.47,
    'octanol/water partition coefficient(logP)': 6,
    'Predicted LD50': 1190,
    'Predicted Toxicity Class': 4,
}
# Wrap every value in a one-element list so the frame has exactly one row.
example_input = pd.DataFrame(
    {column: [value] for column, value in example_row.items()}
)
# Reuse the already-fitted scaler before asking the tuned model for a prediction.
example_input_scaled = scaler.transform(example_input)
example_prediction = best_model.predict(example_input_scaled)
print(f"Example Prediction for Drug Score: {example_prediction[0]}")
OUTPUT (one recorded run; exact values depend on the random forest's random seed):
Best Parameters: {'max_depth': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'n_estimators': 200}
Best Score: 0.01233636934999999
Mean Squared Error (MSE): 0.0011783112499999852
Example Prediction for Drug Score: 0.761600000000001
2.
CODE:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
# Original dataset: 30 named records (P_0 ... P_35), stored column-wise.
data_original = {
    'Name': ['P_0', 'P_1', 'P_10', 'P_11', 'P_12', 'P_13', 'P_14', 'P_15', 'P_16', 'P_17', 'P_18', 'P_19', 'P_2', 'P_20', 'P_21', 'P_22', 'P_23', 'P_24', 'P_25', 'P_26', 'P_27', 'P_28', 'P_29', 'P_3', 'P_30', 'P_31', 'P_32', 'P_33', 'P_34', 'P_35'],
    'Volume ų': [1964.45, 1780.27, 488.96, 375.4, 285.7, 249.36, 227.1, 223.24, 209.39, 206.21, 199.17, 181.23, 1230.68, 170.1, 150.57, 145.35, 136.72, 129.0, 120.37, 117.64, 117.19, 112.19, 109.24, 1155.51, 107.19, 105.38, 103.56, 101.52, 101.52, 100.15],
    'Surface Ų': [2366.68, 2400.5, 833.79, 736.43, 409.39, 377.28, 556.29, 503.26, 376.45, 429.0, 309.26, 404.76, 1564.31, 474.34, 343.34, 347.46, 294.9, 277.52, 232.13, 254.77, 184.68, 194.29, 163.88, 1702.57, 305.14, 258.94, 193.35, 152.28, 242.22, 197.91],
    'Drug Score': [0.8, 0.81, 0.8, 0.63, 0.42, 0.49, 0.39, 0.37, 0.43, 0.35, 0.52, 0.32, 0.81, 0.33, 0.29, 0.25, 0.22, 0.2, 0.19, 0.22, 0.19, 0.16, 0.2, 0.8, 0.17, 0.15, 0.18, 0.17, 0.2, 0.22],
    'Binding Affinity': [5.2, 4.8, 6.1, 5.5, 7.2, 6.5, 5.8, 6.0, 5.9, 6.2, 5.1, 6.8, 4.9, 6.4, 6.6, 6.3, 5.7, 6.9, 5.4, 6.1, 5.6, 6.7, 5.3, 4.7, 6.5, 5.9, 6.0, 5.8, 5.2, 6.4],
    'ADMET_Absorption': [0.8, 0.7, 0.9, 0.6, 0.5, 0.8, 0.7, 0.6, 0.9, 0.8, 0.7, 0.5, 0.9, 0.6, 0.8, 0.7, 0.6, 0.5, 0.8, 0.9, 0.7, 0.6, 0.8, 0.9, 0.7, 0.6, 0.8, 0.9, 0.7, 0.6],
    'ADMET_Distribution': [0.9, 0.8, 0.7, 0.9, 0.6, 0.8, 0.9, 0.7, 0.8, 0.9, 0.7, 0.6, 0.8, 0.9, 0.7, 0.8, 0.9, 0.7, 0.8, 0.9, 0.7, 0.8, 0.9, 0.8, 0.9, 0.7, 0.8, 0.9, 0.8, 0.7],
    'Toxicity': [0.1, 0.2, 0.1, 0.3, 0.2, 0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2, 0.3, 0.1, 0.2, 0.1, 0.3, 0.2, 0.1, 0.2, 0.3, 0.1, 0.2, 0.1, 0.3, 0.2],
}
df_original = pd.DataFrame(data_original)

# Derived feature: element-wise volume divided by surface.
df_original['Volume_to_Surface_Ratio'] = df_original['Volume ų'].div(df_original['Surface Ų'])
# Model inputs: the measured columns plus the derived ratio. 'Name' and the
# target column 'Drug Score' are deliberately not part of the feature list.
feature_columns = [
    'Volume ų',
    'Surface Ų',
    'Volume_to_Surface_Ratio',
    'Binding Affinity',
    'ADMET_Absorption',
    'ADMET_Distribution',
    'Toxicity',
]
X_original = df_original[feature_columns]
# Regression target.
y_original = df_original['Drug Score']

# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_original,
    y_original,
    test_size=0.2,
    random_state=42,
)

# Learn the scaling statistics from the training rows only, then apply the
# same fitted scaler to both the training and the held-out rows.
scaler = StandardScaler().fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)
# Hyperparameter tuning: exhaustive search over every combination below
# (4 * 3 * 3 * 3 = 108 candidates), each scored with 5-fold cross-validation
# on the scaled training rows.
param_grid = {
    'n_estimators': [10, 50, 100, 200],
    'max_depth': [None, 5, 10],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 5, 10],
}
# Seed the forest: without a fixed random_state the bootstrap sampling differs
# on every run, so the selected parameters, the CV score, the test MSE and any
# later prediction would not be reproducible even though the split is seeded.
grid_search = GridSearchCV(
    RandomForestRegressor(random_state=42),
    param_grid,
    cv=5,
    scoring='neg_mean_squared_error',
)
grid_search.fit(X_train_scaled, y_train)
print(f"Best Parameters: {grid_search.best_params_}")
# The scorer is the *negated* MSE, so flip the sign to report a plain MSE.
print(f"Best Score: {-grid_search.best_score_}")
# Evaluate the best model on the held-out rows that the search never saw.
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test_scaled)
mse = mean_squared_error(y_test, y_pred)
print(f"Mean Squared Error (MSE): {mse}")
# Example prediction: one hand-written record whose keys are the training
# feature columns, listed in the same order as the feature selection.
example_row = {
    'Volume ų': 1000,
    'Surface Ų': 1500,
    'Volume_to_Surface_Ratio': 1000/1500,
    'Binding Affinity': 5.5,
    'ADMET_Absorption': 0.8,
    'ADMET_Distribution': 0.9,
    'Toxicity': 0.2,
}
# Wrap every value in a one-element list so the frame has exactly one row.
example_input = pd.DataFrame(
    {column: [value] for column, value in example_row.items()}
)
# Reuse the already-fitted scaler before asking the tuned model for a prediction.
example_input_scaled = scaler.transform(example_input)
example_prediction = best_model.predict(example_input_scaled)
print(f"Example Prediction for Drug Score: {example_prediction[0]}")
OUTPUT: