CNN, MAS, OPTUNA
CODE:
import warnings

import numpy as np
import optuna
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.layers import Conv1D, MaxPooling1D, Flatten, Dense
from tensorflow.keras.models import Sequential
# Sample dataset (replace with your actual data).
# Each tuple below is one compound; its fields follow the order of _COLUMNS.
# In the Activity field, -1 indicates unknown activity.
_COLUMNS = (
    'Name', 'SMILES', 'Activity', 'MolecularWeight', 'LogP', 'TPSA',
    'RotatableBonds', 'HBondDonors', 'HBondAcceptors',
)
_COMPOUNDS = [
    ('Fenofibrate', 'CN(C)C(=O)OC(C(F)(F)F)C(C(F)(F)F)C1=CC=CC=C1',
     1, 360.83, 5.2, 52.6, 6, 0, 3),
    ('Gemfibrozil', 'CC(C)CC(C(C(O)=O)OC(C)C)OC(C)C',
     1, 250.33, 3.4, 46.5, 8, 1, 3),
    ('UDCA', 'C([C@@H](CC1CC([C@@H]([C@H](O)C1)C)C)C)O',
     1, 392.57, 3.0, 60.69, 5, 3, 3),
    ('Silybin', 'CC1(C(C2C(C3OC4C(C(C4OC3(C)C)OC(=O)C5=CC=CC=C5)OC2(C)OC1C6=CC=CC=C6)O)O)O',
     1, 482.44, 2.5, 167.52, 5, 5, 11),
    ('Colchicine', 'CN1C2=C(C=CC2=O)C(C3=C1NC4=C(C=CC=C4)S3)=O',
     1, 399.44, 1.3, 67.43, 2, 1, 5),
    ('Pentoxifylline', 'CN(C)C(=O)N(C)C(C)C1=NN(C)C=N1',
     1, 278.31, 0.2, 71.68, 2, 0, 4),
    ('Vitamin E', 'CC(C)CCCC(C)CCCC(C)CCCC(C)C1=CC=CC(C(O)=O)=C1',
     1, 430.71, 10.0, 29.46, 5, 1, 2),
    ('Myristic Acid', 'CCCCCCCCCCCCCC(O)=O',
     -1, 228.37, 6.1, 37.3, 12, 2, 2),
    ('Palmitic Acid', 'CCCCCCCCCCCCCCCC(O)=O',
     -1, 256.42, 7.1, 37.3, 14, 2, 2),
]
# Transpose the records into the column -> list-of-values mapping.
data = {column: list(values) for column, values in zip(_COLUMNS, zip(*_COMPOUNDS))}
df = pd.DataFrame(data)
# Preprocess data: numeric descriptor columns are the features, 'Activity' the label.
descriptors = ['MolecularWeight', 'LogP', 'TPSA', 'RotatableBonds', 'HBondDonors', 'HBondAcceptors']
X = df[descriptors].values
y = df['Activity'].values

# Split known and unknown data (-1 in 'Activity' marks a compound with no label).
known_idx = np.where(y != -1)[0]
unknown_idx = np.where(y == -1)[0]
X_known = X[known_idx]
y_known = y[known_idx]
X_unknown = X[unknown_idx]

# A classifier fitted on a single class has nothing to discriminate against and
# will assign that class to every unknown compound. Say so loudly instead of
# letting the downstream "predictions" look meaningful.
if np.unique(y_known).size < 2:
    warnings.warn(
        "All labelled compounds share the same Activity value; the model cannot "
        "learn a decision boundary and its predictions for unknown compounds "
        "will be uninformative. Add labelled examples of the other class."
    )

# Scale the data. The scaler is fitted on labelled rows only and then reused,
# so the unlabelled rows are expressed in the same units without influencing it.
scaler = StandardScaler()
X_known_scaled = scaler.fit_transform(X_known)
X_unknown_scaled = scaler.transform(X_unknown)

# Reshape for 1D CNN: (samples, time steps, features). Each descriptor becomes
# one "time step" carrying a single feature channel.
X_known_scaled = X_known_scaled.reshape(X_known_scaled.shape[0], X_known_scaled.shape[1], 1)
X_unknown_scaled = X_unknown_scaled.reshape(X_unknown_scaled.shape[0], X_unknown_scaled.shape[1], 1)
# Optuna objective function: build, train and score one candidate 1D CNN.
def objective(trial):
    """Train one CNN configuration sampled from ``trial`` and return its score.

    Hyperparameters sampled from the trial:
      * ``num_layers``      - number of Conv1D stages, 1 to 3.
      * ``filters_{i}``     - filters of stage ``i``, 16 to 64.
      * ``kernel_size_{i}`` - kernel width of stage ``i``, from 1 up to the
        current sequence length (capped at 5).
      * ``learning_rate``   - Adam learning rate in [1e-5, 1e-2], log scale.

    Reads the module-level ``X_known_scaled`` / ``y_known`` arrays.

    Returns:
        The validation accuracy recorded for the last training epoch, using
        a 20% validation split of the labelled data.
    """
    n_steps = X_known_scaled.shape[1]

    model = Sequential()
    # Declare the input shape once, up front, so that no Conv1D layer needs an
    # `input_shape` argument (and none is ever handed `input_shape=None`).
    model.add(tf.keras.Input(shape=(n_steps, 1)))

    num_layers = trial.suggest_int('num_layers', 1, 3)
    current_size = n_steps  # sequence length entering the next conv stage
    for i in range(num_layers):
        filters = trial.suggest_int(f'filters_{i}', 16, 64)
        # Kernel width is at least 1 and never wider than the sequence it
        # slides over; once the length has shrunk to 1 the only choice is 1.
        max_kernel = max(1, min(5, current_size))
        kernel_size = trial.suggest_int(f'kernel_size_{i}', 1, max_kernel)
        # 'same' padding keeps the length unchanged through the convolution.
        model.add(Conv1D(filters=filters, kernel_size=kernel_size,
                         activation='relu', padding='same'))
        # Pool only while there is more than one step left to halve.
        if current_size > 1:
            model.add(MaxPooling1D(pool_size=2))
            current_size //= 2

    model.add(Flatten())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))

    # The range spans three orders of magnitude, so sample it on a log scale;
    # uniform sampling would put almost every draw in the top decade.
    learning_rate = trial.suggest_float('learning_rate', 1e-5, 1e-2, log=True)
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss='binary_crossentropy', metrics=['accuracy'])

    history = model.fit(X_known_scaled, y_known, epochs=10, validation_split=0.2, verbose=0)
    return history.history['val_accuracy'][-1]
# Run Optuna optimization.
# TPE is Optuna's default sampler; constructing it explicitly with a seed makes
# the sequence of suggested hyperparameters repeatable from run to run. Trial
# scores can still differ between runs because model training itself is not
# seeded here.
study = optuna.create_study(
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=10)
# Get best parameters (the parameter dict of the highest-scoring trial).
best_params = study.best_params
# Build the best model from the hyperparameters found by the study.
def build_best_model(params):
    """Construct and compile the CNN described by ``params``.

    Args:
        params: mapping with the keys ``num_layers``, ``learning_rate`` and,
            for each stage ``i`` in ``range(num_layers)``, ``filters_{i}`` and
            ``kernel_size_{i}``.

    Reads the module-level ``X_known_scaled`` array for the input length.

    Returns:
        A compiled, untrained ``Sequential`` model: one Conv1D stage per
        layer (followed by max-pooling while the sequence is longer than one
        step), then Flatten, Dense(32, relu) and a sigmoid output unit.
    """
    n_steps = X_known_scaled.shape[1]

    model = Sequential()
    # Declare the input shape once, up front, so that no Conv1D layer needs an
    # `input_shape` argument (and none is ever handed `input_shape=None`).
    model.add(tf.keras.Input(shape=(n_steps, 1)))

    current_size = n_steps  # sequence length entering the next conv stage
    for i in range(params['num_layers']):
        model.add(Conv1D(filters=params[f'filters_{i}'],
                         kernel_size=params[f'kernel_size_{i}'],
                         activation='relu', padding='same'))
        # Pool only while there is more than one step left to halve.
        if current_size > 1:
            model.add(MaxPooling1D(pool_size=2))
            current_size //= 2

    model.add(Flatten())
    model.add(Dense(32, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=params['learning_rate']),
                  loss='binary_crossentropy', metrics=['accuracy'])
    return model
# Fit the tuned architecture on every labelled compound (no validation split here).
best_model = build_best_model(best_params)
best_model.fit(X_known_scaled, y_known, verbose=0, epochs=10)
# Score the unlabelled compounds, then turn the sigmoid outputs into 0/1 labels
# with a 0.5 cut-off and write them over the -1 placeholders in the table.
predictions = best_model.predict(X_unknown_scaled)
predicted_activity = (predictions.flatten() > 0.5).astype(int)
df.loc[unknown_idx, 'Activity'] = predicted_activity
# Multi-Agent System (MAS) for drug discovery
class MASLangernetAutomations:
    """Run a fixed sequence of reporting "agents" over a compound table.

    Every agent prints its report to stdout. The table is expected to provide
    the columns 'Name', 'Activity' and 'LogP'.

    Args:
        df: pandas DataFrame of compounds. It is used in place (not copied);
            ``drug_efficacy_agent`` adds an 'Efficacy' column to it.
        logp_threshold: LogP value above which a compound is reported as
            "High LogP" by ``drug_formulation_agent``. Defaults to 5.
    """

    # Sentinel stored in 'Activity' for compounds whose activity is not known.
    UNKNOWN_ACTIVITY = -1

    def __init__(self, df, logp_threshold=5):
        self.df = df
        self.logp_threshold = logp_threshold

    def data_pattern_agent(self):
        """Print summary statistics of the table's numeric columns."""
        print("Data Pattern Agent: Analyzing dataset patterns...")
        print(self.df.describe())

    def drug_design_agent(self):
        """Print each compound's name next to its activity value."""
        print("Drug Design Agent: Predicted activities:")
        print(self.df[['Name', 'Activity']])

    def drug_formulation_agent(self):
        """Print a delivery suggestion per compound based on its LogP."""
        print("Drug Formulation Agent: Suggesting formulations...")
        # Walk the two needed columns directly rather than materialising a
        # Series for every row.
        for name, logp in zip(self.df['Name'], self.df['LogP']):
            if logp > self.logp_threshold:
                print(f"{name}: High LogP, suggest lipid-based delivery.")
            else:
                print(f"{name}: Low LogP, suggest aqueous formulations.")

    def drug_efficacy_agent(self):
        """Add an 'Efficacy' column derived from 'Activity' and print it.

        Activity 1 maps to 'High'. The unknown sentinel (-1) maps to
        'Unknown' so that a compound nobody has scored is not reported as
        having low efficacy. Every other value maps to 'Low'.
        """
        print("Drug Efficacy Agent: Mapping efficacy...")
        labels = {1: 'High', self.UNKNOWN_ACTIVITY: 'Unknown'}
        self.df['Efficacy'] = self.df['Activity'].apply(
            lambda activity: labels.get(activity, 'Low')
        )
        print(self.df[['Name', 'Efficacy']])

    def run_pipeline(self):
        """Run all four agents in order: pattern, design, formulation, efficacy."""
        self.data_pattern_agent()
        self.drug_design_agent()
        self.drug_formulation_agent()
        self.drug_efficacy_agent()
# Hand the compound table to the agents and execute them in their fixed order.
mas = MASLangernetAutomations(df=df)
mas.run_pipeline()
OUTPUT:
Data Pattern Agent: Analyzing dataset patterns...
       Activity  MolecularWeight       LogP        TPSA  RotatableBonds  \
count       9.0         9.000000   9.000000    9.000000        9.000000
mean        1.0       342.157778   4.311111   63.386667        6.555556
std         0.0        91.172501   3.080765   41.607857        4.126473
min         1.0       228.370000   0.200000   29.460000        2.000000
25%         1.0       256.420000   2.500000   37.300000        5.000000
50%         1.0       360.830000   3.400000   52.600000        5.000000
75%         1.0       399.440000   6.100000   67.430000        8.000000
max         1.0       482.440000  10.000000  167.520000       14.000000

       HBondDonors  HBondAcceptors
count     9.000000        9.000000
mean      1.666667        3.888889
std       1.581139        2.848001
min       0.000000        2.000000
25%       1.000000        2.000000
50%       1.000000        3.000000
75%       2.000000        4.000000
max       5.000000       11.000000
Drug Design Agent: Predicted activities:
             Name  Activity
0     Fenofibrate         1
1     Gemfibrozil         1
2            UDCA         1
3         Silybin         1
4      Colchicine         1
5  Pentoxifylline         1
6       Vitamin E         1
7   Myristic Acid         1
8   Palmitic Acid         1
Drug Formulation Agent: Suggesting formulations...
Fenofibrate: High LogP, suggest lipid-based delivery.
Gemfibrozil: Low LogP, suggest aqueous formulations.
UDCA: Low LogP, suggest aqueous formulations.
Silybin: Low LogP, suggest aqueous formulations.
Colchicine: Low LogP, suggest aqueous formulations.
Pentoxifylline: Low LogP, suggest aqueous formulations.
Vitamin E: High LogP, suggest lipid-based delivery.
Myristic Acid: High LogP, suggest lipid-based delivery.
Palmitic Acid: High LogP, suggest lipid-based delivery.
Drug Efficacy Agent: Mapping efficacy...
             Name Efficacy
0     Fenofibrate     High
1     Gemfibrozil     High
2            UDCA     High
3         Silybin     High
4      Colchicine     High
5  Pentoxifylline     High
6       Vitamin E     High
7   Myristic Acid     High
8   Palmitic Acid     High
AI-driven CLD, MAFLD diseases' drug discovery for demonstration purposes © 2025 by DEVISE FOUNDATION is licensed under CC BY-NC-ND 4.0
No comments:
Post a Comment