In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import gc
# Tell NumPy to stay silent on divide-by-zero and invalid-value floating-point
# operations for the rest of the session (results such as inf/NaN are still produced).
np.seterr(divide='ignore', invalid='ignore')
from warnings import simplefilter
# Hide pandas PerformanceWarning messages.
simplefilter(action="ignore", category=pd.errors.PerformanceWarning)

In [2]:
from rdkit import Chem
from rdkit import DataStructs
from rdkit import RDConfig
from rdkit.Chem.Fingerprints import ClusterMols, DbFpSupplier, MolSimilarity, SimilarityScreener
from rdkit.Chem.Fingerprints import FingerprintMols as fp
from rdkit.Chem import AllChem, rdmolops, Lipinski, Descriptors
from rdkit.Chem.Descriptors import ExactMolWt, HeavyAtomMolWt, MolWt 
from rdkit.Chem.AllChem import GetMorganFingerprintAsBitVect
from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray
from rdkit.Avalon.pyAvalonTools import GetAvalonFP 

In [3]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.regularizers import l2
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import regularizers

from sklearn.model_selection import train_test_split
from sklearn.metrics import r2_score,mean_absolute_error,mean_squared_error

In [4]:
# Ask TensorFlow to grow memory on the first visible GPU on demand.
gpus = tf.config.experimental.list_physical_devices('GPU')
if len(gpus) > 0:
    try:
        # Only the first listed device is configured; any others are left untouched.
        tf.config.experimental.set_memory_growth(gpus[0], True)
    except RuntimeError as err:
        # Report the problem and carry on with TensorFlow's default behaviour.
        print(err)

In [5]:
# Each dataset is read from ./data, its SMILES column is cast to pandas' "string"
# dtype, and the SMILES / target columns are then picked by *position*.

# ws496: SMILES from positional column 1, target from positional column 2.
data_ws = pd.read_csv('./data/ws496_logS.csv')
data_ws['SMILES'] = pd.Series(data_ws['SMILES'], dtype="string")
smiles_ws = data_ws.iloc[:, 1]
y_ws = data_ws.iloc[:, 2]
y_ws_nponly = y_ws.to_numpy()

# Delaney: SMILES from the last column, target from positional column 1.
data_delaney = pd.read_csv('./data/delaney-processed.csv')
data_delaney['smiles'] = pd.Series(data_delaney['smiles'], dtype="string")
smiles_de = data_delaney.iloc[:, -1]
y_de = data_delaney.iloc[:, 1]
y_de_nponly = y_de.to_numpy()

# Lovric 2020: SMILES from positional column 0, target from positional column 1.
data_lovric2020 = pd.read_csv('./data/Lovric2020_logS0.csv')
data_lovric2020['isomeric_smiles'] = pd.Series(data_lovric2020['isomeric_smiles'], dtype="string")
smiles_lo = data_lovric2020.iloc[:, 0]
y_lo = data_lovric2020.iloc[:, 1]
y_lo_nponly = y_lo.to_numpy()

# Huuskonen: SMILES from positional column 4, target from the last column (cast to float).
data_huuskonen = pd.read_csv('./data/huusk.csv')
data_huuskonen['SMILES'] = pd.Series(data_huuskonen['SMILES'], dtype="string")
smiles_hu = data_huuskonen.iloc[:, 4]
y_hu = data_huuskonen.iloc[:, -1].astype('float')
y_hu_nponly = y_hu.to_numpy()

In [6]:
def mol3d_conv(mol):
    """Embed coordinates into every molecule of ``mol`` in place.

    For each entry the function assigns chiral tags from the structure, calls
    ``AllChem.EmbedMolecule`` with torsion-angle preferences and basic
    knowledge enabled, and renders a mol block whose text is not kept.

    Args:
        mol: iterable of RDKit molecule objects; they are modified in place.

    Returns:
        The same ``mol`` object that was passed in.
    """
    for entry in mol:
        Chem.AssignAtomChiralTagsFromStructure(entry)
        AllChem.EmbedMolecule(
            entry,
            useExpTorsionAnglePrefs=True,
            useBasicKnowledge=True,
        )
        # The mol-block text is discarded, exactly as in the original code.
        Chem.MolToMolBlock(entry, confId=-1)
    return mol

def mol3d_conv2(mol):
    """Give every molecule of ``mol`` 2D coordinates and run an ETKDGv2 embedding.

    NOTE(review): the embedding is applied to the object returned by
    ``Chem.AddHs``, which is only held in a local variable and never stored.
    The molecules handed back to the caller therefore carry whatever
    ``AllChem.Compute2DCoords`` set on them, not the embedded geometry.
    Confirm whether the hydrogen-added, embedded molecules were meant to be
    returned instead.

    Args:
        mol: iterable of RDKit molecule objects; ``Compute2DCoords`` is applied
            to each of them in place.

    Returns:
        The same ``mol`` object that was passed in.
    """
    for entry in mol:
        AllChem.Compute2DCoords(entry)
        with_hydrogens = Chem.AddHs(entry)
        params = AllChem.ETKDGv2()
        params.randomSeed = 0xf00d  # fixed seed for the embedding call
        AllChem.EmbedMolecule(with_hydrogens, params)
    return mol

def conformer_idf(func, mol):
    """Evaluate ``func`` on every molecule that has exactly one conformer.

    The result always has one entry per input molecule, in input order, so it
    can be stacked next to other per-molecule columns:

    * exactly one conformer -> ``np.asarray(func(m)).astype('float')``
    * no conformer          -> ``0.0``
    * more than one         -> the warning below is printed and ``0.0`` is used
      as a placeholder. (Previously such molecules were skipped, which made
      the result shorter than the input and broke row alignment downstream.)

    Args:
        func: callable taking a single molecule and returning a numeric value.
        mol: iterable of objects exposing ``GetNumConformers()``.

    Returns:
        list with ``len(mol)`` entries (float arrays or ``0.0`` placeholders).
    """
    arr = []
    for entry in mol:
        conformer_count = entry.GetNumConformers()  # queried once per molecule
        if conformer_count == 1:
            arr.append(np.asarray(func(entry)).astype('float'))
            continue
        if conformer_count != 0:
            print("Every molecule must have at most 1 conformer!")
        arr.append(0.0)
    return arr

In [7]:
def fp_converter(data):
    """Parse SMILES strings and build three fingerprint matrices.

    Args:
        data: iterable of SMILES strings.

    Returns:
        Tuple ``(mols, ECFP, MACCS, Avalon)`` where ``mols`` is the list
        returned by ``Chem.MolFromSmiles`` for each input and the other three
        are ``np.asarray`` results holding, per molecule, the Morgan
        (radius 2, 2048 bits), MACCS-keys and Avalon fingerprints.
    """
    LEN_OF_FF = 2048

    def bitvect_to_array(bitvect):
        # ConvertToNumpyArray writes the fingerprint into the array handed to it.
        target = np.zeros((1,), dtype=int)
        DataStructs.ConvertToNumpyArray(bitvect, target)
        return target

    mols = [Chem.MolFromSmiles(smiles) for smiles in data]
    ecfp_vectors = [
        AllChem.GetMorganFingerprintAsBitVect(molecule, 2, nBits=LEN_OF_FF)
        for molecule in mols
    ]
    maccs_vectors = [
        Chem.rdMolDescriptors.GetMACCSKeysFingerprint(molecule) for molecule in mols
    ]
    avalon_vectors = [GetAvalonFP(molecule) for molecule in mols]

    ECFP_container = np.asarray([bitvect_to_array(vector) for vector in ecfp_vectors])
    MACCS_container = np.asarray([bitvect_to_array(vector) for vector in maccs_vectors])
    # The Avalon fingerprints are handed to NumPy directly, without ConvertToNumpyArray.
    AvalonFP_container = np.asarray(avalon_vectors)
    return mols, ECFP_container, MACCS_container, AvalonFP_container

In [8]:
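# Disabled cell. When enabled it calls fp_converter on each SMILES column to get
# mol_* (molecule lists) plus x_* / MACCS_* / AvalonFP_* fingerprint arrays, and then
# builds group_n* as their column-wise concatenation. Re-enable it if mol_* or
# group_n* are needed elsewhere in the notebook.
# mol_ws, x_ws, MACCS_ws, AvalonFP_ws = fp_converter(smiles_ws)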
# mol_de, x_de, MACCS_de, AvalonFP_de = fp_converter(smiles_de)
# mol_lo, x_lo, MACCS_lo, AvalonFP_lo = fp_converter(smiles_lo)
# mol_hu, x_hu, MACCS_hu, AvalonFP_hu = fp_converter(smiles_hu)

# group_nws = np.concatenate([x_ws,MACCS_ws,AvalonFP_ws], axis=1)
# group_nde = np.concatenate([x_de,MACCS_de,AvalonFP_de], axis=1)
# group_nlo = np.concatenate([x_lo,MACCS_lo,AvalonFP_lo], axis=1)
# group_nhu = np.concatenate([x_hu,MACCS_hu,AvalonFP_hu], axis=1)

In [9]:
def _log1p_clean(values, shift=0.0):
    """Return ``log1p(values + shift)`` as floats with NaN entries replaced by 0."""
    arr = np.asarray(values).astype('float')
    if shift:
        arr = arr + shift
    return np.nan_to_num(np.log1p(arr), nan=0.0)


def _numbered_family(module, prefix, count):
    """List ``(name, function)`` pairs for ``<prefix>1`` .. ``<prefix><count>`` found on ``module``."""
    return [(f"{prefix}{k}", getattr(module, f"{prefix}{k}")) for k in range(1, count + 1)]


def _bcut2d_rows(pool):
    """Compute BCUT2D for every molecule, using 8 zeros for molecules that fail."""
    try:
        return [Chem.rdMolDescriptors.BCUT2D(m) for m in pool]
    except ValueError as e:
        print(f"BCUT2D is not working with {e}")
    # At least one molecule failed: redo the work one molecule at a time so a
    # single failure only costs that molecule's row.
    rows = []
    for m in pool:
        try:
            rows.append(Chem.rdMolDescriptors.BCUT2D(m))
        except Exception:
            print(f"Error with : {Chem.MolToSmiles(m)}")
            rows.append([0.0] * 8)
    return rows


def search_data_origin(features, fps, mols, name):
    """Append the descriptor groups switched on in ``features`` to ``fps``.

    Descriptor groups are appended left to right in flag order. The numeric
    output is the same as the previous copy-pasted implementation; the blocks
    are now collected and concatenated once instead of re-copying the whole
    matrix after every group.

    Args:
        features: sequence with at least 49 entries; entry ``k`` enables group
            ``k`` when it equals 1. A shorter sequence raises ``IndexError``.
        fps: 2-D array-like (one row per molecule) the new columns are
            appended to.
        mols: list of RDKit molecules, one per row of ``fps``.
            ``mol3d_conv2`` is always called on it, so the molecules are
            modified in place.
        name: unused; kept so existing calls keep working.

    Returns:
        ``(matrix, column_names)``. ``matrix`` is ``fps`` plus the selected
        descriptor columns, passed through ``np.nan_to_num(..., nan=0.0)``.
        ``column_names`` has one label per column of ``matrix`` when ``fps``
        is 2-D: ``fp_<i>`` for the columns of ``fps``, then descriptor labels
        (``<label>_<j>`` for vector-valued descriptors). The previous
        implementation returned the undefined name ``_`` in this position.

    NOTE(review):
        * Flags 43/44/45 run CalcAUTOCORR3D / CalcRDF / BCUT2D respectively,
          while the accompanying comments label them "BCUT2D" /
          "AUTOCORR3D" / "RDF". The computation is kept as it was and the
          returned labels name what is actually computed -- confirm which
          side is intended.
        * The "ipc" column goes through ``conformer_idf`` and is evaluated
          before ``mol3d_conv2`` runs; it is presumably 0 for molecules that
          have no conformer yet -- confirm that this is wanted.
        * The former ChiNn_/ChiNv_ series was always resized to width 0 and
          therefore never added a column; that dead code was removed.
    """
    # Reading all 49 positions up front keeps the fail-fast IndexError for short lists.
    enabled = [features[k] == 1 for k in range(49)]

    base = np.asarray(fps)
    blocks = [base]
    names = [f"fp_{k}" for k in range(base.shape[1])] if base.ndim == 2 else []

    def append_block(values, labels):
        # Store one descriptor block (1-D -> a single column) together with its labels.
        block = np.asarray(values).astype('float')
        if block.ndim == 1:
            block = block[:, None]
        blocks.append(block)
        if isinstance(labels, str):
            width = block.shape[1]
            labels = [labels] if width == 1 else [f"{labels}_{k}" for k in range(1, width + 1)]
        names.extend(labels)

    def per_molecule(label, func, pool, transform=None):
        # Evaluate ``func`` on every molecule; ``func`` may return a scalar or a vector.
        values = [func(m) for m in pool]
        append_block(values if transform is None else transform(values), label)

    def column_family(pairs, pool, transform=None):
        # Evaluate several scalar descriptors and append them as adjacent columns.
        columns = [np.asarray([func(m) for m in pool]).astype('float') for _, func in pairs]
        block = np.column_stack(columns)
        append_block(block if transform is None else transform(block),
                     [label for label, _ in pairs])

    def log_shifted(values):
        # log1p(x + 0.0001) followed by NaN -> 0, as used for several vector descriptors.
        return _log1p_clean(values, 0.0001)

    def run_scalars(table, pool):
        for index, label, func in table:
            if enabled[index]:
                per_molecule(label, func, pool)

    # The lambdas defer every RDKit attribute lookup until the flag is actually on.
    run_scalars([
        (0, "MolWeight", lambda m: ExactMolWt(m)),
        (1, "Mol_MR", lambda m: Chem.Crippen.MolMR(m)),
        (2, "Mol_TPSA", lambda m: Descriptors.TPSA(m)),
        (3, "Mol_logP", lambda m: Chem.Crippen.MolLogP(m)),
        (4, "RotatedBonds", lambda m: Chem.Lipinski.NumRotatableBonds(m)),
        (5, "HeavyAtom", lambda m: Chem.Lipinski.HeavyAtomCount(m)),
        (6, "numHAcceptor", lambda m: Chem.Lipinski.NumHAcceptors(m)),
        (7, "numHDoner", lambda m: Chem.Lipinski.NumHDonors(m)),
        (8, "numHeteroatom", lambda m: Chem.Lipinski.NumHeteroatoms(m)),
        (9, "NumValenceElec", lambda m: Chem.Descriptors.NumValenceElectrons(m)),
        (10, "NHOHCount", lambda m: Chem.Lipinski.NHOHCount(m)),
        (11, "NOCount", lambda m: Chem.Lipinski.NOCount(m)),
        (12, "Ringcount", lambda m: Chem.Lipinski.RingCount(m)),
        (13, "numAromaticR", lambda m: Chem.Lipinski.NumAromaticRings(m)),
        (14, "numSaturateR", lambda m: Chem.Lipinski.NumSaturatedRings(m)),
        (15, "numAliphaticR", lambda m: Chem.Lipinski.NumAliphaticRings(m)),
        (16, "LabuteASA", lambda m: Chem.rdMolDescriptors.CalcLabuteASA(m)),
        (17, "BalabanJs", lambda m: Chem.GraphDescriptors.BalabanJ(m)),
        (18, "BertzCTs", lambda m: Chem.GraphDescriptors.BertzCT(m)),
    ], mols)

    if enabled[19]:
        # Ipc is gated by conformer_idf and log-compressed (see NOTE(review) above).
        append_block(_log1p_clean(conformer_idf(Chem.GraphDescriptors.Ipc, mols)), "ipc")
    if enabled[20]:
        column_family(_numbered_family(Chem.GraphDescriptors, "Kappa", 3), mols)
    if enabled[21]:
        chi_names = ["Chi0", "Chi0n", "Chi0v", "Chi1", "Chi1n", "Chi1v",
                     "Chi2n", "Chi2v", "Chi3n", "Chi3v", "Chi4n", "Chi4v"]
        column_family([(label, getattr(Chem.GraphDescriptors, label)) for label in chi_names],
                      mols)

    run_scalars([
        (22, "phi", lambda m: Chem.rdMolDescriptors.CalcPhi(m)),
        (23, "HallKierAlpha", lambda m: Chem.GraphDescriptors.HallKierAlpha(m)),
        (24, "NumAmideBonds", lambda m: Chem.rdMolDescriptors.CalcNumAmideBonds(m)),
        (25, "FractionCSP3", lambda m: Chem.Lipinski.FractionCSP3(m)),
        (26, "NumSpiroAtoms", lambda m: Chem.rdMolDescriptors.CalcNumSpiroAtoms(m)),
        (27, "NumBridgeheadAtoms", lambda m: Chem.rdMolDescriptors.CalcNumBridgeheadAtoms(m)),
    ], mols)

    if enabled[28]:
        column_family(_numbered_family(Chem.MolSurf, "PEOE_VSA", 14), mols)
    if enabled[29]:
        column_family(_numbered_family(Chem.MolSurf, "SMR_VSA", 10), mols)
    if enabled[30]:
        column_family(_numbered_family(Chem.MolSurf, "SlogP_VSA", 12), mols)
    if enabled[31]:
        column_family(_numbered_family(Chem.EState.EState_VSA, "EState_VSA", 11), mols)
    if enabled[32]:
        column_family(_numbered_family(Chem.EState.EState_VSA, "VSA_EState", 10), mols)

    # Coordinate-dependent descriptors. The conversion is always run, as before,
    # even when none of the remaining flags is on.
    mol3d = mol3d_conv2(mols)

    if enabled[33]:
        per_molecule("Asphericity", lambda m: Chem.rdMolDescriptors.CalcAsphericity(m), mol3d)
    if enabled[34]:
        append_block(conformer_idf(Chem.rdMolDescriptors.CalcPBF, mol3d), "PBF")
    if enabled[35]:
        pmi_pairs = [(f"PMI{k}", getattr(Chem.rdMolDescriptors, f"CalcPMI{k}")) for k in (1, 2, 3)]
        column_family(pmi_pairs, mol3d, transform=_log1p_clean)
    if enabled[36]:
        npr_pairs = [(f"NPR{k}", getattr(Chem.rdMolDescriptors, f"CalcNPR{k}")) for k in (1, 2)]
        column_family(npr_pairs, mol3d)

    run_scalars([
        (37, "RadiusOfGyration", lambda m: Chem.rdMolDescriptors.CalcRadiusOfGyration(m)),
        (38, "InertialShapeFactor", lambda m: Chem.rdMolDescriptors.CalcInertialShapeFactor(m)),
        (39, "Eccentricity", lambda m: Chem.rdMolDescriptors.CalcEccentricity(m)),
    ], mol3d)

    if enabled[40]:
        append_block(conformer_idf(Chem.rdMolDescriptors.CalcSpherocityIndex, mol3d),
                     "SpherocityIndex")
    if enabled[41]:
        per_molecule("MQNs", lambda m: Chem.rdMolDescriptors.MQNs_(m), mol3d)
    if enabled[42]:
        per_molecule("AUTOCORR2D", lambda m: Chem.rdMolDescriptors.CalcAUTOCORR2D(m), mol3d,
                     transform=log_shifted)
    if enabled[43]:
        # NOTE(review): the flag comments call this position "BCUT2D".
        per_molecule("AUTOCORR3D", lambda m: Chem.rdMolDescriptors.CalcAUTOCORR3D(m), mol3d,
                     transform=log_shifted)
    if enabled[44]:
        # NOTE(review): the flag comments call this position "AUTOCORR3D".
        per_molecule("RDF", lambda m: Chem.rdMolDescriptors.CalcRDF(m), mol3d)
    if enabled[45]:
        # NOTE(review): the flag comments call this position "RDF".
        append_block(log_shifted(_bcut2d_rows(mol3d)), "BCUT2D")
    if enabled[46]:
        per_molecule("MORSE", lambda m: Chem.rdMolDescriptors.CalcMORSE(m), mol3d,
                     transform=log_shifted)
    if enabled[47]:
        per_molecule("WHIM", lambda m: Chem.rdMolDescriptors.CalcWHIM(m), mol3d)
    if enabled[48]:
        per_molecule("GETAWAY", lambda m: Chem.rdMolDescriptors.CalcGETAWAY(m), mol3d,
                     transform=log_shifted)

    # One concatenation at the end; with no group enabled the input is returned as-is
    # (apart from the NaN clean-up), without requiring it to be 2-D.
    merged = blocks[0] if len(blocks) == 1 else np.concatenate(blocks, axis=1)
    return np.nan_to_num(merged, nan=0.0), names

In [10]:
# Descriptor on/off switches for the "ws" dataset, read positionally by
# search_data_origin (1 = append that descriptor group, 0 = skip it).
ws_input_fea=[
 1, #phase1 "MolWeight"
 1, #phase2 "Mol_MR"
 1, #phase3 "Mol_TPSA"
 1, #phase4 "Mol_logP"
 1, #phase5 "RotatedBonds"
 1, #phase6 "HeavyAtom"
 0, #phase7 "numHAcceptor"
 0, #phase8 "numHDoner"
 0, #phase9 "numHeteroatom"
 1, #phase10 "NumValenceElec"
 1, #phase11 "NHOHCount"
 1, #phase12 "NOCount"
 0, #phase13 "Ringcount"
 1, #phase14 "numAromaticR"
 0, #phase15 "numSaturateR"
 0, #phase16 "numAliphaticR"
 0, #phase17 "LabuteASA"
 1, #phase18 "BalabanJs"
 1, #phase19 "BertzCTs"
 0, #phase20 "ipc"
 0, #phase21 "kappa_Series[1-3]"
 1, #phase22 "Chi_Series[13]"
 1, #phase23 "phi"
 0, #phase24 "HallKierAlpha"
 0, #phase25 "NumAmideBonds"
 1, #phase26 "FractionCSP3"
 0, #phase27 "NumSpiroAtoms"
 1, #phase28 "NumBridgeheadAtoms"
 1, #phase29 "PEOE_VSA_Series[1-14]"
 1, #phase30 "SMR_VSA_Series[1-10]"
 0, #phase31 "SlogP_VSA_Series[1-12]"
 1, #phase32 "EState_VSA_Series[1-11]"
 0, #phase33 "VSA_EState_Series[1-10]"
 0, #phase34 "Asphericity"
 1, #phase35 "PBF"
 1, #phase36 "PMI_series[1-3]"
 0, #phase37 "NPR_series[1-2]"
 0, #phase38 "RadiusOfGyration"
 0, #phase39 "InertialShapeFactor"
 1, #phase40 "Eccentricity"
 0, #phase41 "SpherocityIndex"
 0, #phase42 "MQNs"
 0, #phase43 "AUTOCORR2D"
 1, #phase44 "BCUT2D" -- NOTE(review): search_data_origin computes CalcAUTOCORR3D for this flag
 0, #phase45 "AUTOCORR3D" -- NOTE(review): search_data_origin computes CalcRDF for this flag
 1, #phase46 "RDF" -- NOTE(review): search_data_origin computes BCUT2D for this flag
 0, #phase47 "MORSE"
 1, #phase48 "WHIM"
 0, #phase49 "GETAWAY"
]

In [11]:
# Descriptor on/off switches for the "de" dataset, read positionally by
# search_data_origin (1 = append that descriptor group, 0 = skip it).
de_input_fea=[
 1, #phase1 "MolWeight"
 1, #phase2 "Mol_MR"
 1, #phase3 "Mol_TPSA"
 1, #phase4 "Mol_logP"
 0, #phase5 "RotatedBonds"
 0, #phase6 "HeavyAtom"
 1, #phase7 "numHAcceptor"
 1, #phase8 "numHDoner"
 0, #phase9 "numHeteroatom"
 0, #phase10 "NumValenceElec"
 1, #phase11 "NHOHCount"
 0, #phase12 "NOCount"
 0, #phase13 "Ringcount"
 0, #phase14 "numAromaticR"
 0, #phase15 "numSaturateR"
 1, #phase16 "numAliphaticR"
 1, #phase17 "LabuteASA"
 1, #phase18 "BalabanJs"
 1, #phase19 "BertzCTs"
 1, #phase20 "ipc"
 0, #phase21 "kappa_Series[1-3]"
 0, #phase22 "Chi_Series[13]"
 0, #phase23 "phi"
 1, #phase24 "HallKierAlpha"
 1, #phase25 "NumAmideBonds"
 1, #phase26 "FractionCSP3"
 1, #phase27 "NumSpiroAtoms"
 0, #phase28 "NumBridgeheadAtoms"
 1, #phase29 "PEOE_VSA_Series[1-14]"
 1, #phase30 "SMR_VSA_Series[1-10]"
 0, #phase31 "SlogP_VSA_Series[1-12]"
 0, #phase32 "EState_VSA_Series[1-11]"
 0, #phase33 "VSA_EState_Series[1-10]"
 1, #phase34 "Asphericity"
 0, #phase35 "PBF"
 0, #phase36 "PMI_series[1-3]"
 1, #phase37 "NPR_series[1-2]"
 0, #phase38 "RadiusOfGyration"
 0, #phase39 "InertialShapeFactor"
 0, #phase40 "Eccentricity"
 0, #phase41 "SpherocityIndex"
 0, #phase42 "MQNs"
 1, #phase43 "AUTOCORR2D"
 1, #phase44 "BCUT2D" -- NOTE(review): search_data_origin computes CalcAUTOCORR3D for this flag
 0, #phase45 "AUTOCORR3D" -- NOTE(review): search_data_origin computes CalcRDF for this flag
 1, #phase46 "RDF" -- NOTE(review): search_data_origin computes BCUT2D for this flag
 0, #phase47 "MORSE"
 1, #phase48 "WHIM"
 0, #phase49 "GETAWAY"
]

In [12]:
# Descriptor on/off switches for the "lo" dataset, read positionally by
# search_data_origin (1 = append that descriptor group, 0 = skip it).
lo_input_fea=[
 1, #phase1 "MolWeight"
 1, #phase2 "Mol_MR"
 1, #phase3 "Mol_TPSA"
 1, #phase4 "Mol_logP"
 1, #phase5 "RotatedBonds"
 0, #phase6 "HeavyAtom"
 0, #phase7 "numHAcceptor"
 0, #phase8 "numHDoner"
 1, #phase9 "numHeteroatom"
 1, #phase10 "NumValenceElec"
 1, #phase11 "NHOHCount"
 1, #phase12 "NOCount"
 0, #phase13 "Ringcount"
 1, #phase14 "numAromaticR"
 0, #phase15 "numSaturateR"
 0, #phase16 "numAliphaticR"
 0, #phase17 "LabuteASA"
 1, #phase18 "BalabanJs"
 0, #phase19 "BertzCTs"
 0, #phase20 "ipc"
 1, #phase21 "kappa_Series[1-3]"
 0, #phase22 "Chi_Series[13]"
 1, #phase23 "phi"
 1, #phase24 "HallKierAlpha"
 0, #phase25 "NumAmideBonds"
 1, #phase26 "FractionCSP3"
 1, #phase27 "NumSpiroAtoms"
 0, #phase28 "NumBridgeheadAtoms"
 1, #phase29 "PEOE_VSA_Series[1-14]"
 1, #phase30 "SMR_VSA_Series[1-10]"
 1, #phase31 "SlogP_VSA_Series[1-12]"
 0, #phase32 "EState_VSA_Series[1-11]"
 1, #phase33 "VSA_EState_Series[1-10]"
 1, #phase34 "Asphericity"
 0, #phase35 "PBF"
 0, #phase36 "PMI_series[1-3]"
 1, #phase37 "NPR_series[1-2]"
 1, #phase38 "RadiusOfGyration"
 0, #phase39 "InertialShapeFactor"
 0, #phase40 "Eccentricity"
 1, #phase41 "SpherocityIndex"
 0, #phase42 "MQNs"
 0, #phase43 "AUTOCORR2D"
 0, #phase44 "BCUT2D" -- NOTE(review): search_data_origin computes CalcAUTOCORR3D for this flag
 0, #phase45 "AUTOCORR3D" -- NOTE(review): search_data_origin computes CalcRDF for this flag
 1, #phase46 "RDF" -- NOTE(review): search_data_origin computes BCUT2D for this flag
 0, #phase47 "MORSE"
 0, #phase48 "WHIM"
 0, #phase49 "GETAWAY"
]

In [13]:
# Descriptor on/off switches for the "hu" dataset, read positionally by
# search_data_origin (1 = append that descriptor group, 0 = skip it).
hu_input_fea=[
 1, #phase1 "MolWeight"
 1, #phase2 "Mol_MR"
 1, #phase3 "Mol_TPSA"
 1, #phase4 "Mol_logP"
 0, #phase5 "RotatedBonds"
 1, #phase6 "HeavyAtom"
 0, #phase7 "numHAcceptor"
 1, #phase8 "numHDoner"
 1, #phase9 "numHeteroatom"
 1, #phase10 "NumValenceElec"
 0, #phase11 "NHOHCount"
 1, #phase12 "NOCount"
 1, #phase13 "Ringcount"
 1, #phase14 "numAromaticR"
 1, #phase15 "numSaturateR"
 0, #phase16 "numAliphaticR"
 0, #phase17 "LabuteASA"
 1, #phase18 "BalabanJs"
 1, #phase19 "BertzCTs"
 1, #phase20 "ipc"
 0, #phase21 "kappa_Series[1-3]"
 1, #phase22 "Chi_Series[13]"
 1, #phase23 "phi"
 0, #phase24 "HallKierAlpha"
 1, #phase25 "NumAmideBonds"
 0, #phase26 "FractionCSP3"
 1, #phase27 "NumSpiroAtoms"
 0, #phase28 "NumBridgeheadAtoms"
 1, #phase29 "PEOE_VSA_Series[1-14]"
 1, #phase30 "SMR_VSA_Series[1-10]"
 1, #phase31 "SlogP_VSA_Series[1-12]"
 1, #phase32 "EState_VSA_Series[1-11]"
 1, #phase33 "VSA_EState_Series[1-10]"
 1, #phase34 "Asphericity"
 1, #phase35 "PBF"
 1, #phase36 "PMI_series[1-3]"
 1, #phase37 "NPR_series[1-2]"
 1, #phase38 "RadiusOfGyration"
 1, #phase39 "InertialShapeFactor"
 0, #phase40 "Eccentricity"
 0, #phase41 "SpherocityIndex"
 1, #phase42 "MQNs"
 1, #phase43 "AUTOCORR2D"
 1, #phase44 "BCUT2D" -- NOTE(review): search_data_origin computes CalcAUTOCORR3D for this flag
 1, #phase45 "AUTOCORR3D" -- NOTE(review): search_data_origin computes CalcRDF for this flag
 1, #phase46 "RDF" -- NOTE(review): search_data_origin computes BCUT2D for this flag
 0, #phase47 "MORSE"
 0, #phase48 "WHIM"
 0, #phase49 "GETAWAY"
]

In [14]:
def _load_or_build_features(tag, feature_flags, smiles):
    """Return the feature matrix for one dataset, using ``new_<tag>_final.csv`` as a cache.

    When the cache file exists it is read back. Otherwise the fingerprints are
    generated with ``fp_converter``, concatenated column-wise (ECFP, MACCS,
    Avalon), extended by ``search_data_origin`` and written to the cache file.

    The previous cell wrapped everything in a bare ``except:`` and its fallback
    used ``group_n*`` / ``mol_*`` names that are only defined in a
    commented-out cell, so a cache miss on a fresh kernel ended in a NameError.
    Only a missing or empty cache file triggers the rebuild now; any other
    error is allowed to surface.

    Args:
        tag: dataset tag used in the cache file name ('ws', 'de', 'lo', 'hu').
        feature_flags: descriptor on/off list handed to ``search_data_origin``.
        smiles: iterable of SMILES strings handed to ``fp_converter``.

    Returns:
        ``(matrix, column_names)``; ``column_names`` is ``None`` when the matrix
        came from the cache, otherwise the second value returned by
        ``search_data_origin``.
    """
    cache_path = f"new_{tag}_final.csv"
    try:
        matrix = pd.read_csv(cache_path).to_numpy()
        column_names = None
    except (FileNotFoundError, pd.errors.EmptyDataError):
        mols, ecfp, maccs, avalon = fp_converter(smiles)
        fingerprints = np.concatenate([ecfp, maccs, avalon], axis=1)
        matrix, column_names = search_data_origin(feature_flags, fingerprints, mols, tag)
        pd.DataFrame(matrix).to_csv(cache_path, index=False)
    return np.nan_to_num(matrix, nan=0.0), column_names


new_ws, pd_names_new_ws = _load_or_build_features('ws', ws_input_fea, smiles_ws)
new_de, pd_names_new_de = _load_or_build_features('de', de_input_fea, smiles_de)
new_lo, pd_names_new_lo = _load_or_build_features('lo', lo_input_fea, smiles_lo)
new_hu, pd_names_new_hu = _load_or_build_features('hu', hu_input_fea, smiles_hu)

In [15]:
# Training hyper-parameters.
BATCHSIZE = 16   # presumably the mini-batch size used by later training cells -- confirm
EPOCHS = 1_000   # presumably the epoch budget used by later training cells -- confirm
lr = 1e-4        # learning rate read by the *_model builders when they create Adam
decay = 1e-4     # module-level L2 factor; the builders shown define their own local value

In [16]:
def ws_model():
    """Build and compile the network used for the 'ws' dataset.

    Architecture: two ReLU Dense layers (7897 then 9994 units), each with
    glorot-uniform initialisation and an L2 kernel penalty of 1e-4, and each
    followed by Dropout(0.1); then a single-unit Dense output.

    Compiled with Adam (learning rate taken from the module-level ``lr``),
    MSE loss, and MSE / MAE / RMSE metrics.

    Returns:
        The compiled ``tf.keras.Sequential`` model (not yet fit).
    """
    l2_strength = 1e-4
    net = tf.keras.Sequential()
    for width in (7897, 9994):
        net.add(
            tf.keras.layers.Dense(
                width,
                activation="relu",
                kernel_initializer='glorot_uniform',
                kernel_regularizer=tf.keras.regularizers.l2(l2_strength),
            )
        )
        net.add(Dropout(rate=0.1))
    net.add(Dense(units=1))
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='mse',
        metrics=['mse', 'mae', tf.keras.metrics.RootMeanSquaredError()],
    )
    return net

In [17]:
def de_model():
    """Build and compile the network used for the 'de' dataset.

    Architecture: one ReLU Dense layer of 4882 units (glorot-uniform
    initialisation, L2 kernel penalty 1e-4), Dropout(0.1), then a single-unit
    Dense output.

    Compiled with Adam (learning rate taken from the module-level ``lr``),
    MSE loss, and MSE / MAE / RMSE metrics.

    Returns:
        The compiled ``tf.keras.Sequential`` model (not yet fit).
    """
    l2_strength = 1e-4
    net = tf.keras.Sequential()
    hidden = tf.keras.layers.Dense(
        4882,
        activation="relu",
        kernel_initializer='glorot_uniform',
        kernel_regularizer=tf.keras.regularizers.l2(l2_strength),
    )
    net.add(hidden)
    net.add(Dropout(rate=0.1))
    net.add(Dense(units=1))
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='mse',
        metrics=['mse', 'mae', tf.keras.metrics.RootMeanSquaredError()],
    )
    return net

In [18]:
def lo_model():
    """Build and compile the network used for the 'lo' dataset.

    Architecture: two ReLU Dense layers (6365 then 9298 units), each with
    glorot-uniform initialisation and an L2 kernel penalty of 1e-5, and each
    followed by Dropout(0.1); then a single-unit Dense output.

    Compiled with Adam (learning rate taken from the module-level ``lr``),
    MSE loss, and MSE / MAE / RMSE metrics.

    Returns:
        The compiled ``tf.keras.Sequential`` model (not yet fit).
    """
    l2_strength = 1e-5
    net = tf.keras.Sequential()
    for width in (6365, 9298):
        net.add(
            tf.keras.layers.Dense(
                width,
                activation="relu",
                kernel_initializer='glorot_uniform',
                kernel_regularizer=tf.keras.regularizers.l2(l2_strength),
            )
        )
        net.add(Dropout(rate=0.1))
    net.add(Dense(units=1))
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='mse',
        metrics=['mse', 'mae', tf.keras.metrics.RootMeanSquaredError()],
    )
    return net

In [19]:
def hu_model():
    """Build and compile the network used for the 'hu' dataset.

    Architecture: one ReLU Dense layer of 6325 units (glorot-uniform
    initialisation, L2 kernel penalty 1e-4), Dropout(0.1), then a single-unit
    Dense output.

    Compiled with Adam (learning rate taken from the module-level ``lr``),
    MSE loss, and MSE / MAE / RMSE metrics.

    Returns:
        The compiled ``tf.keras.Sequential`` model (not yet fit).
    """
    l2_strength = 1e-4
    net = tf.keras.Sequential()
    hidden = tf.keras.layers.Dense(
        6325,
        activation="relu",
        kernel_initializer='glorot_uniform',
        kernel_regularizer=tf.keras.regularizers.l2(l2_strength),
    )
    net.add(hidden)
    net.add(Dropout(rate=0.1))
    net.add(Dense(units=1))
    net.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='mse',
        metrics=['mse', 'mae', tf.keras.metrics.RootMeanSquaredError()],
    )
    return net

In [20]:
# 90/10 train/test split for each dataset, with the same fixed seed for all four.
(xtr_fws, xte_fws, ytr_fws, yte_fws) = train_test_split(
    new_ws, y_ws_nponly, random_state=42, test_size=0.1
)
(xtr_fde, xte_fde, ytr_fde, yte_fde) = train_test_split(
    new_de, y_de_nponly, random_state=42, test_size=0.1
)
(xtr_flo, xte_flo, ytr_flo, yte_flo) = train_test_split(
    new_lo, y_lo_nponly, random_state=42, test_size=0.1
)
(xtr_fhu, xte_fhu, ytr_fhu, yte_fhu) = train_test_split(
    new_hu, y_hu_nponly, random_state=42, test_size=0.1
)

In [21]:
# Checkpoint file path per dataset; the name encodes the dataset tag plus the
# BATCHSIZE / EPOCHS / lr values in effect when this cell runs.
ws_url, de_url, lo_url, hu_url = (
    "./save_model/{}_model_{}batch_{}epoch_{}lr.h5".format(tag, BATCHSIZE, EPOCHS, lr)
    for tag in ('ws', 'de', 'lo', 'hu')
)

In [22]:
# One ModelCheckpoint callback per dataset, each writing to that dataset's path.
# save_best_only is intentionally left unset here (it was previously tried and
# commented out: `save_best_only=True`).
cp_ws, cp_de, cp_lo, cp_hu = (
    tf.keras.callbacks.ModelCheckpoint(path, monitor='val_loss', verbose=1, mode='auto')
    for path in (ws_url, de_url, lo_url, hu_url)
)

In [23]:
# cb = tf.keras.callbacks.EarlyStopping(
# monitor="val_loss",
# patience=200,
# verbose=0,
# mode="auto"
# )

In [24]:
def _load_or_build_model(tag, path, build_fn):
    """Return the model saved at ``path``, or a fresh one from ``build_fn``.

    Args:
        tag: Short dataset label ('ws', 'de', 'lo', ...) used in progress messages.
        path: File path handed to ``tf.keras.models.load_model``.
        build_fn: Zero-argument callable that builds and compiles the matching
            architecture; used only when loading fails.

    Returns:
        The loaded model, or the newly built one. A newly built model has not
        been fit, so it must be trained before its predictions mean anything.
    """
    try:
        print(f"reading...{tag}")
        model = tf.keras.models.load_model(path)
        print(f"{tag}_model : {model}")
        print(f"Finished...{tag}\n")
    except (OSError, ValueError) as load_err:
        # Report why the load failed instead of silently swallowing everything.
        print(f"could not load {path}: {load_err}")
        print(f"Creating...{tag}")
        model = build_fn()
        print(f"Finished...{tag}\n")
    return model


# Each dataset falls back to its OWN architecture. Previously the ws and de
# fallbacks both called hu_model(), and the lo branch printed a "de_model" label.
model_fws1 = _load_or_build_model('ws', ws_url, ws_model)
model_fde1 = _load_or_build_model('de', de_url, de_model)
model_flo1 = _load_or_build_model('lo', lo_url, lo_model)
####################################################
####################################################
####################################################
# try:
# print(f"reading...hu")
# model_fhu1 = tf.keras.models.load_model(hu_url)
# print(f"lo_model : {model_fhu1}")
# print(f"Finished...hu\n")
# except:
# print(f"Creating...hu")
# model_fhu1 = hu_model()
# print(f"Finished...hu\n")

reading...ws
ws_model : 
Finished...ws

reading...de
de_model : 
Finished...de

reading...lo
de_model : 
Finished...lo



In [None]:
# tf.keras.backend.clear_session() 
# model_fws1 = ws_model()
# model_fws1.fit(xtr_fws,ytr_fws,
# batch_size=BATCHSIZE,
# callbacks=[cp_ws],
# validation_split=0.1,
# epochs=EPOCHS,
# verbose=1,
# )
# model_fws1.save(f'./save_model/ws_manual_save_model_{BATCHSIZE}batch_{EPOCHS}epochs_{lr}lr.h5')
# R^2 of the ws model on the 10% test split produced by train_test_split.
y_pred_search = model_fws1.predict(x=xte_fws, verbose=0)
score = r2_score(y_true=yte_fws, y_pred=y_pred_search)
print(f"ws r2 score : {score}")
gc.collect()


ws r2 score : 0.9163043268412985


650

In [None]:
# tf.keras.backend.clear_session()
# model_fde1.fit(xtr_fde,ytr_fde,
# batch_size=BATCHSIZE,
# callbacks=[cp_de],
# validation_split=0.1,
# epochs=EPOCHS,
# verbose=1,
# )
# model_fde1.save(f'./save_model/de_manual_save_model_{BATCHSIZE}batch_{EPOCHS}epochs_{lr}lr.h5')
# R^2 of the de model on the 10% test split produced by train_test_split.
# Any NaN prediction is replaced by 0.0 before scoring.
y_pred_search = np.nan_to_num(model_fde1.predict(x=xte_fde, verbose=0), nan=0.0)
score = r2_score(y_true=yte_fde, y_pred=y_pred_search)
print(f"de r2 score : {score}")
gc.collect()

de r2 score : 0.990616653856253


655

In [27]:
# R^2 of the lo model on the 10% test split produced by train_test_split.
y_pred_search = model_flo1.predict(x=xte_flo, verbose=0)
score = r2_score(y_true=yte_flo, y_pred=y_pred_search)
print(f"lo r2 score : {score}")
gc.collect()

lo r2 score : 0.7896706466730885


650

In [28]:
# print(f"reading...hu")
# model_fhu1 = tf.keras.models.load_model(hu_url)
# print(f"lo_model : {model_fhu1}")
# print(f"Finished...hu\n")

In [29]:
# y_pred_search = model_fhu1.predict(xte_fhu, verbose=0)
# score = r2_score(yte_fhu, y_pred_search)
# print(f"hu r2 score : {score}")
# gc.collect()

In [None]:
# tf.keras.backend.clear_session()
# model_flo1.fit(xtr_flo,ytr_flo,
# batch_size=BATCHSIZE,
# callbacks=[cp_lo],
# validation_split=0.1,
# epochs=EPOCHS,
# verbose=1,
# )
# model_flo1.save(f'./save_model/lo_manual_save_model_{BATCHSIZE}batch_{lr}lr.h5')
# y_pred_search = model_flo1.predict(xte_flo, verbose=0)
# score = r2_score(yte_flo, y_pred_search)
# print(f"lo r2 score : {score}")
# gc.collect()

In [31]:
# Epoch count for the hu training run below (same value as the earlier
# hyper-parameter cell). Changing it here does not rename the checkpoint
# paths, which were already formatted from the earlier value.
EPOCHS = 1000

In [32]:
# Build a fresh, not-yet-fit hu network; the code that would instead reload a
# saved hu model is commented out elsewhere in this notebook.
model_fhu1 = hu_model()

In [None]:
# Train the hu model, save it, then report R^2 on the 10% test split.
tf.keras.backend.clear_session()
model_fhu1.fit(
    x=xtr_fhu,
    y=ytr_fhu,
    epochs=EPOCHS,
    batch_size=BATCHSIZE,
    verbose=1,
    callbacks=[cp_hu],
    # validation_split=0.1,
    # NOTE: with validation disabled no val_loss is produced, although cp_hu
    # was created with monitor='val_loss'; the captured output shows the
    # checkpoint still being written after every epoch.
)
model_fhu1.save(f'./save_model/hu_manual_save_model_{BATCHSIZE}batch_{lr}lr.h5')
y_pred_search = model_fhu1.predict(x=xte_fhu, verbose=0)
score = r2_score(y_true=yte_fhu, y_pred=y_pred_search)
print(f"hu r2 score : {score}")
gc.collect()

Epoch 1/1000
Epoch 1: saving model to ./save_model\hu_model_16batch_1000epoch_0.0001lr.h5
Epoch 2/1000
Epoch 2: saving model to ./save_model\hu_model_16batch_1000epoch_0.0001lr.h5
Epoch 3/1000
Epoch 3: saving model to ./save_model\hu_model_16batch_1000epoch_0.0001lr.h5
Epoch 4/1000
Epoch 4: saving model to ./save_model\hu_model_16batch_1000epoch_0.0001lr.h5
Epoch 5/1000
Epoch 5: saving model to ./save_model\hu_model_16batch_1000epoch_0.0001lr.h5
Epoch 6/1000
Epoch 6: saving model to ./save_model\hu_model_16batch_1000epoch_0.0001lr.h5
Epoch 7/1000
Epoch 7: saving model to ./save_model\hu_model_16batch_1000epoch_0.0001lr.h5
Epoch 8/1000
Epoch 8: saving model to ./save_model\hu_model_16batch_1000epoch_0.0001lr.h5
Epoch 9/1000
Epoch 9: saving model to ./save_model\hu_model_16batch_1000epoch_0.0001lr.h5
Epoch 10/1000
Epoch 10: saving model to ./save_model\hu_model_16batch_1000epoch_0.0001lr.h5
Epoch 11/1000
Epoch 11: saving model to ./save_model\hu_model_16batch_1000epoch_0.0001lr.h5
Epoch 

2038

In [None]:
# from sklearn.model_selection import KFold
# split_num=4
# kf = KFold(n_splits=split_num)
# model_fws2 = ws_model()
# score_accum={}
# i=0

# tf.keras.backend.clear_session()
# for tr, te in kf.split(xtr_fws):
# xtr, xte = xtr_fws[tr], xtr_fws[te]
# ytr, yte = ytr_fws[tr], ytr_fws[te]
# model_fws2.fit(xtr,ytr,
# batch_size=BATCHSIZE,
# validation_split=0.1,
# epochs=EPOCHS,
# verbose=0,
# )
# y_pred_search = model_fws2.predict(xte, verbose=0)
# score = r2_score(yte, y_pred_search)
# score_accum['r2_valid{}'.format(i)]=score
# print(f"Finishied #{i+1} - {score}") 
# i+=1
# gc.collect()
# model_fws2.save("{}_model_cross_validation_{}batch_{}epoch_{}lr.h5".format('ws',BATCHSIZE,EPOCHS,lr))
# y_pred_search = model_fws2.predict(xte_fws, verbose=0)
# score = r2_score(yte_fws, y_pred_search)
# print(score_accum)
# print(f"Final {score}")

In [35]:
# res = 0
# for i,j in score_accum.items():
# res+=j
# res = res/4.0
# res

In [36]:
# %tensorboard --logdir logs/gradient_tape