{ "cells": [ { "cell_type": "code", "execution_count": 13, "metadata": {}, "outputs": [], "source": [ "import os\n", "import sys\n", "import time\n", "import subprocess\n", "import logging\n", "import warnings\n", "import gc\n", "import numpy as np\n", "import pandas as pd\n", "import seaborn as sns\n", "import matplotlib.pyplot as plt\n", "import matplotlib.patches as mpatches\n", "from concurrent.futures import ProcessPoolExecutor, as_completed" ] }, { "cell_type": "code", "execution_count": 14, "metadata": {}, "outputs": [], "source": [ "from rdkit import Chem\n", "from rdkit.Chem import AllChem, DataStructs, Draw\n", "from rdkit import RDConfig\n", "from rdkit.Chem import Descriptors, rdMolDescriptors, Lipinski, rdDistGeom, rdPartialCharges\n", "from rdkit.Chem.AllChem import GetMorganGenerator\n", "from rdkit.DataStructs.cDataStructs import ConvertToNumpyArray\n", "from rdkit.Avalon.pyAvalonTools import GetAvalonFP" ] }, { "cell_type": "code", "execution_count": 15, "metadata": {}, "outputs": [], "source": [ "import tensorflow as tf\n", "from tensorflow import keras\n", "from tensorflow.keras import layers\n", "from tensorflow.keras.models import Sequential\n", "from tensorflow.keras.layers import Dense, Dropout, Activation\n", "from tensorflow.keras.regularizers import l2\n", "from tensorflow.keras.optimizers import Adam\n", "from tensorflow.keras import regularizers" ] }, { "cell_type": "code", "execution_count": 16, "metadata": {}, "outputs": [], "source": [ "from sklearn.model_selection import train_test_split\n", "from sklearn.linear_model import Ridge\n", "from sklearn.ensemble import RandomForestRegressor\n", "from sklearn.neural_network import MLPRegressor\n", "from sklearn.svm import SVR\n", "from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error, root_mean_squared_error" ] }, { "cell_type": "code", "execution_count": 17, "metadata": {}, "outputs": [], "source": [ "import optuna\n", "from optuna.trial import TrialState" ] }, { 
"cell_type": "code", "execution_count": 18, "metadata": {}, "outputs": [], "source": [ "tf.keras.backend.clear_session()\n", "gpus = tf.config.experimental.list_physical_devices('GPU')\n", "if gpus:\n", " try:\n", " for gpu in gpus:\n", " tf.config.experimental.set_memory_growth(gpu, True)\n", " except RuntimeError as e:\n", " print(e)" ] }, { "cell_type": "code", "execution_count": 19, "metadata": {}, "outputs": [], "source": [ "target_path = \"result/5_ANO_structure\"\n", "os.makedirs(target_path, exist_ok=True)" ] }, { "cell_type": "code", "execution_count": 20, "metadata": {}, "outputs": [], "source": [ "data_ws = pd.read_csv('./data/ws496_logS.csv', dtype={'SMILES': 'string'})\n", "smiles_ws = data_ws['SMILES']\n", "y_ws = data_ws.iloc[:, 2]\n", "\n", "data_delaney = pd.read_csv('./data/delaney-processed.csv', dtype={'smiles': 'string'})\n", "smiles_de = data_delaney['smiles']\n", "y_de = data_delaney.iloc[:, 1]\n", "\n", "data_lovric2020 = pd.read_csv('./data/Lovric2020_logS0.csv', dtype={'isomeric_smiles': 'string'})\n", "smiles_lo = data_lovric2020['isomeric_smiles']\n", "y_lo = data_lovric2020.iloc[:, 1]\n", "\n", "data_huuskonen = pd.read_csv('./data/huusk.csv', dtype={'SMILES': 'string'})\n", "smiles_hu = data_huuskonen['SMILES']\n", "y_hu = data_huuskonen.iloc[:, -1].astype('float')" ] }, { "cell_type": "code", "execution_count": 21, "metadata": {}, "outputs": [], "source": [ "def mol3d(mol):\n", " mol = Chem.AddHs(mol)\n", " optimization_methods = [\n", " (AllChem.EmbedMolecule, (mol, AllChem.ETKDGv3()), {}),\n", " (AllChem.UFFOptimizeMolecule, (mol,), {'maxIters': 200}),\n", " (AllChem.MMFFOptimizeMolecule, (mol,), {'maxIters': 200})\n", " ]\n", "\n", " for method, args, kwargs in optimization_methods:\n", " try:\n", " method(*args, **kwargs)\n", " if mol.GetNumConformers() > 0:\n", " return mol\n", " except ValueError as e:\n", " print(f\"Error: {e} - Trying next optimization method [{method}]\")\n", "\n", " print(f\"Invalid mol for 3d 
{'\\033[94m'}{Chem.MolToSmiles(mol)}{'\\033[0m'} - No conformer generated\")\n", " return None" ] }, { "cell_type": "code", "execution_count": 22, "metadata": {}, "outputs": [], "source": [ "def convert_smiles_to_mol(smiles, fail_folder=None, index=None, yvalue=None):\n", " mol = Chem.MolFromSmiles(smiles)\n", " if mol is None:\n", " print(f\"[convert_smiles_to_mol] Cannot convert {smiles} to Mols\")\n", " return None, {\"smiles\": smiles, \"y_value\": yvalue, \"error\": \"Invalid SMILES\"}\n", "\n", " try:\n", " Chem.Kekulize(mol, clearAromaticFlags=True)\n", " isomeric_smiles = Chem.MolToSmiles(mol, isomericSmiles=True)\n", " mol = Chem.MolFromSmiles(isomeric_smiles)\n", " except Exception as e:\n", " print(f\"[convert_smiles_to_mol] failed {smiles} isomeric_smiles by {e}\")\n", " if fail_folder and index is not None:\n", " img_path = os.path.join(fail_folder, f\"mol_{index}.png\")\n", " img = Draw.MolToImage(mol)\n", " img.save(img_path)\n", " return None, {\"smiles\": smiles, \"y_value\": yvalue, \"error\": f\"Isomeric SMILES error: {e}\"}\n", "\n", " try:\n", " Chem.SanitizeMol(mol)\n", " except Exception as e:\n", " print(f\"[convert_smiles_to_mol] failed {smiles} SanitizeMol by {e}\")\n", " if fail_folder and index is not None:\n", " img_path = os.path.join(fail_folder, f\"mol_{index}.png\")\n", " img = Draw.MolToImage(mol)\n", " img.save(img_path)\n", " return None, {\"smiles\": smiles, \"y_value\": yvalue, \"error\": f\"SanitizeMol error: {e}\"}\n", "\n", " return mol, None" ] }, { "cell_type": "code", "execution_count": 23, "metadata": {}, "outputs": [], "source": [ "def process_smiles(smiles, yvalue, fail_folder, index):\n", " mol, error = convert_smiles_to_mol(smiles, fail_folder, index, yvalue)\n", " if error:\n", " return None, None, error\n", "\n", " mol_3d = mol3d(mol)\n", " if mol_3d:\n", " return smiles, yvalue, None\n", " else:\n", " img_path = os.path.join(fail_folder, f\"mol_{index}.png\")\n", " img = Draw.MolToImage(mol)\n", " 
img.save(img_path)\n", " return None, None, {\"smiles\": smiles, \"y_value\": yvalue}\n", "\n", "def process_dataset(smiles_list, y_values, dataset_name, target_path=\"result\", max_workers=None):\n", " start = time.time()\n", " valid_smiles, valid_y = [], []\n", " error_smiles_list = []\n", " fail_folder = f\"{target_path}/failed/{dataset_name}\"\n", " os.makedirs(fail_folder, exist_ok=True)\n", "\n", " with ProcessPoolExecutor(max_workers=max_workers) as executor:\n", " futures = [\n", " executor.submit(process_smiles, smiles, yvalue, fail_folder, i)\n", " for i, (smiles, yvalue) in enumerate(zip(smiles_list, y_values))\n", " ]\n", " for future in as_completed(futures):\n", " smiles, yvalue, error = future.result()\n", " if error:\n", " error_smiles_list.append(error)\n", " elif smiles is not None and yvalue is not None:\n", " valid_smiles.append(smiles)\n", " valid_y.append(yvalue)\n", "\n", " if error_smiles_list:\n", " error_df = pd.DataFrame(error_smiles_list)\n", " error_df.to_csv(os.path.join(fail_folder, \"failed_smiles.csv\"), index=False)\n", " print(f\" [{dataset_name:<10}] : {time.time()-start:.4f} sec\")\n", " return valid_smiles, valid_y" ] }, { "cell_type": "code", "execution_count": 24, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ " [ws496 ] : 0.8649 sec\n", " [delaney ] : 1.3527 sec\n", "Error: Bad Conformer Id - Trying next optimization method []\n", "Error: Bad Conformer Id - Trying next optimization method []\n", "Invalid mol for 3d \u001b[94m[H]O[C@@]([H])(c1c([H])c([H])nc2c([H])c([H])c(OC([H])([H])[H])c([H])c12)[C@]1([H])[N@]2C([H])([H])C([H])([H])[C@@]([H])(C1([H])[H])[C@@]([H])(C([H])=C([H])[H])C2([H])[H]\u001b[0m - No conformer generated\n", "Error: Bad Conformer Id - Trying next optimization method []\n", "Error: Bad Conformer Id - Trying next optimization method []\n", "Invalid mol for 3d 
# Bit lengths of the three fingerprint families.
LEN_OF_FF = 2048  # Morgan / ECFP (radius 2)
LEN_OF_MA = 167   # MACCS keys
LEN_OF_AV = 512   # Avalon


def get_fingerprints(mol):
    """Compute the Morgan, MACCS and Avalon fingerprints of one molecule.

    Args:
        mol: RDKit molecule, or ``None``.

    Returns:
        ``(ecfp_array, maccs, avalon_array)`` where ``ecfp_array`` and
        ``avalon_array`` are integer numpy vectors and ``maccs`` is the RDKit
        bit vector returned by ``GetMACCSKeysFingerprint``.
        ``(None, None, None)`` when ``mol`` is ``None``.
    """
    if mol is None:
        return None, None, None

    morgan_generator = GetMorganGenerator(radius=2, fpSize=LEN_OF_FF)
    ecfp = morgan_generator.GetFingerprint(mol)
    ecfp_array = np.zeros((LEN_OF_FF,), dtype=int)
    DataStructs.ConvertToNumpyArray(ecfp, ecfp_array)

    maccs = Chem.rdMolDescriptors.GetMACCSKeysFingerprint(mol)

    # Request the size explicitly so the vector always matches LEN_OF_AV
    # instead of relying on the library default.
    avalon_fp = GetAvalonFP(mol, nBits=LEN_OF_AV)
    avalon_array = np.zeros((LEN_OF_AV,), dtype=int)
    DataStructs.ConvertToNumpyArray(avalon_fp, avalon_array)

    return ecfp_array, maccs, avalon_array


def fp_converter(data, use_parallel=True):
    """Convert SMILES strings into three row-aligned fingerprint matrices.

    Args:
        data: Iterable of SMILES strings.
        use_parallel: Evaluated for truthiness only. When true, fingerprints
            are computed in a ``ProcessPoolExecutor`` and the function falls
            back to a sequential loop if the pool fails.
            NOTE(review): the notebook calls ``fp_converter(smiles, target_path)``,
            so the path string lands in this parameter and merely acts as
            ``True`` - confirm that is intended.

    Returns:
        ``(mols, ecfp, maccs, avalon)``: the parsed molecules (``None`` for
        SMILES that could not be parsed) and three integer matrices with
        exactly one row per input SMILES. Rows of unparsable SMILES are all
        zeros, so every matrix stays aligned with the input and its targets.
    """
    smiles_list = list(data)
    n_rows = len(smiles_list)

    ECFP_container = np.zeros((n_rows, LEN_OF_FF), dtype=int)
    MACCS_container = np.zeros((n_rows, LEN_OF_MA), dtype=int)
    AvalonFP_container = np.zeros((n_rows, LEN_OF_AV), dtype=int)
    if n_rows == 0:
        # Nothing to do; avoids starting a process pool for an empty input.
        return [], ECFP_container, MACCS_container, AvalonFP_container

    mols = [Chem.MolFromSmiles(smi) for smi in smiles_list]

    results = None
    if use_parallel:
        try:
            with ProcessPoolExecutor() as executor:
                results = list(executor.map(get_fingerprints, mols))
        except Exception as e:
            print(f"Parallel processing failed due to: {e}. Falling back to sequential processing.")
    if results is None:
        results = [get_fingerprints(mol) for mol in mols]

    # Fill all three matrices by row index. Stacking only the non-None
    # results (as was done for ECFP/Avalon but not for MACCS) would give the
    # matrices different row counts as soon as one SMILES fails to parse.
    failed_rows = []
    for i, (ecfp, maccs, avalon) in enumerate(results):
        if ecfp is None:
            failed_rows.append(i)
            continue
        ECFP_container[i] = ecfp
        DataStructs.ConvertToNumpyArray(maccs, MACCS_container[i])
        AvalonFP_container[i] = avalon

    if failed_rows:
        print(
            f"[fp_converter] {len(failed_rows)} SMILES could not be parsed; "
            f"their fingerprint rows are all zeros (first indices: {failed_rows[:10]})"
        )

    return mols, ECFP_container, MACCS_container, AvalonFP_container


def concatenate_to_numpy(*dataframes):
    """Join DataFrames and/or numpy arrays side by side into one numpy array.

    Args:
        *dataframes: ``pandas.DataFrame`` or ``numpy.ndarray`` objects that
            share the same number of rows.

    Returns:
        A single array with the inputs concatenated along axis 1.

    Raises:
        ValueError: If any input is neither a DataFrame nor a numpy array.
    """
    numpy_arrays = []
    for item in dataframes:
        if isinstance(item, pd.DataFrame):
            item = item.to_numpy()
        if not isinstance(item, np.ndarray):
            raise ValueError("All inputs must be either pandas DataFrame or numpy array")
        numpy_arrays.append(item)
    return np.concatenate(numpy_arrays, axis=1)
def search_model(trial, input_dim):
    """Build an uncompiled Keras MLP whose architecture is sampled from ``trial``.

    Sampled parameters, in this order: ``n_layers`` (1-3), ``layer_dropout``
    (0 or 1), then per hidden layer ``n_units_l_<i>`` (2-10000) and
    ``n_decay_l_<i>`` (L2 factor). With ``layer_dropout == 1`` every hidden
    layer is followed by its own dropout (``F_dropout_<i>``); with
    ``layer_dropout == 0`` a single dropout (``last_dropout``) is placed before
    the output layer.

    Args:
        trial: Optuna trial used to sample the hyper-parameters.
        input_dim: Number of input features.

    Returns:
        A ``tf.keras.Sequential`` regression model with one output unit. The
        model is not compiled here; optimizer and loss are expected to be set
        by whoever loads it (presumably ``extra_code/learning_process.py`` -
        confirm).
    """
    n_layers = trial.suggest_int("n_layers", 1, 3)
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Input(shape=(input_dim,)))
    layer_dropout = trial.suggest_int("layer_dropout", 0, 1)

    for i in range(n_layers):
        num_hidden = trial.suggest_int(f"n_units_l_{i}", 2, 10000)
        num_decay = trial.suggest_categorical(f"n_decay_l_{i}", [1e-3, 1e-4, 1e-5])
        model.add(
            tf.keras.layers.Dense(
                num_hidden,
                activation="relu",
                kernel_initializer='glorot_uniform',
                kernel_regularizer=tf.keras.regularizers.l2(num_decay),
            )
        )
        if layer_dropout == 1:
            fdropout1 = trial.suggest_categorical(f"F_dropout_{i}", [0.1, 0.2, 0.3])
            model.add(tf.keras.layers.Dropout(rate=fdropout1))

    if layer_dropout == 0:
        fdropout2 = trial.suggest_categorical("last_dropout", [0.1, 0.2, 0.3])
        model.add(tf.keras.layers.Dropout(rate=fdropout2))

    model.add(tf.keras.layers.Dense(units=1))
    return model


def save_model(trial, x_data):
    """Sample a fresh architecture and write it to ``save_model/full_model.keras``.

    Any model file left over from a previous trial is removed first.

    Args:
        trial: Optuna trial forwarded to ``search_model``.
        x_data: Feature matrix; only ``x_data.shape[1]`` is used.

    Returns:
        ``True`` when the model was written, ``False`` when building or saving
        failed (the error is printed, not raised).
    """
    model_path = "save_model/full_model.keras"

    # Plain "remove, then save" replaces the earlier self-recursion, which
    # could never terminate if the file survived os.remove().
    if os.path.exists(model_path):
        print(f"Model already exists at {model_path}")
        os.remove(model_path)

    try:
        model = search_model(trial, x_data.shape[1])
        os.makedirs("save_model", exist_ok=True)
        model.save(model_path)
        print(f"Model successfully saved to {model_path}")
    except Exception as e:
        print(f"Error saving model: {e}")
        return False
    return True


def suppress_warnings(condition=True):
    """Silence (``True``) or re-enable (``False``) TensorFlow log output.

    Sets the level of the ``tensorflow`` Python logger and the
    ``TF_CPP_MIN_LOG_LEVEL`` environment variable.
    """
    logger_level, cpp_level = (logging.ERROR, '3') if condition else (logging.WARNING, '0')
    logging.getLogger('tensorflow').setLevel(logger_level)
    os.environ['TF_CPP_MIN_LOG_LEVEL'] = cpp_level


# stderr lines containing any of these fragments are known noise and dropped.
_IGNORED_STDERR_FRAGMENTS = (
    "could not open file to read NUMA node",
    "Your kernel may have been built without NUMA support",
)


def _filter_subprocess_stderr(stderr_text):
    """Return ``stderr_text`` without the lines matching the known-noise fragments."""
    return '\n'.join(
        line
        for line in stderr_text.split('\n')
        if not any(fragment in line for fragment in _IGNORED_STDERR_FRAGMENTS)
    )


def _run_struct_objective(trial, features, targets):
    """Shared body of the ``objective_*_struct`` functions.

    Writes ``features``/``targets`` to ``new_fps.npy``/``y_true.npy``, saves a
    freshly sampled model, runs ``./extra_code/learning_process.py`` in a
    subprocess and reads the R2 score from its stdout.

    Returns:
        The last R2 value printed by the subprocess, or ``0.0`` when anything
        other than pruning went wrong (the reason is printed to stderr).

    Raises:
        optuna.exceptions.TrialPruned: When the subprocess marks the result
            with ``(prune)`` or the study's pruner asks to stop the trial.
    """
    r2_result = 0.0
    try:
        y_true = np.asarray(targets).astype('float')
        np.save('new_fps.npy', features)
        np.save('y_true.npy', y_true)

        # Training without a freshly saved architecture would be meaningless.
        if save_model(trial, features) is False:
            raise RuntimeError("model architecture could not be saved")

        lr = trial.suggest_categorical("lr", [1e-3, 1e-4, 1e-5])

        # sys.executable keeps the training script in the same interpreter /
        # environment as the notebook kernel.
        result = subprocess.run(
            [sys.executable or 'python3', './extra_code/learning_process.py',
             str(BATCHSIZE), str(EPOCHS),
             str(lr),
             'new_fps.npy', 'y_true.npy'],
            stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)

        if result.stderr:
            filtered_stderr = _filter_subprocess_stderr(result.stderr)
            if filtered_stderr:
                print(f"Error in subprocess: {filtered_stderr}", file=sys.stderr)

        parsed_r2 = None
        for line in result.stdout.splitlines():
            if "R2" not in line:
                continue
            if "(prune)" in line:
                print(f"Pruning trial due to poor R2: {line}")
                trial.report(0.0, step=0)
                raise optuna.exceptions.TrialPruned()
            parsed_r2 = float(line.split(":")[1].strip())
            print(f"R2 score: {parsed_r2}")

        if parsed_r2 is None:
            # Previously this left the return value unbound (UnboundLocalError).
            raise RuntimeError(
                f"no R2 line in subprocess output (exit code {result.returncode})"
            )

        r2_result = parsed_r2
        trial.report(r2_result, step=0)

        if trial.should_prune():
            raise optuna.exceptions.TrialPruned()

    except optuna.exceptions.TrialPruned:
        # TrialPruned is an Exception subclass: it must be re-raised before
        # the generic handler, otherwise the trial is stored as COMPLETE
        # with value 0.0 and nothing is ever pruned.
        raise
    except Exception as e:
        print(f"Exception occurred: {e}", file=sys.stderr)
        r2_result = 0.0
    finally:
        gc.collect()

    return r2_result


def objective_ws_struct(trial):
    """Optuna objective for the ws496 dataset (``group_nws`` / ``y_ws``)."""
    return _run_struct_objective(trial, group_nws, y_ws)


def objective_de_struct(trial):
    """Optuna objective for the Delaney dataset (``group_nde`` / ``y_de``)."""
    return _run_struct_objective(trial, group_nde, y_de)


def objective_lo_struct(trial):
    """Optuna objective for the Lovric2020 dataset (``group_nlo`` / ``y_lo``)."""
    return _run_struct_objective(trial, group_nlo, y_lo)


def objective_hu_struct(trial):
    """Optuna objective for the Huuskonen dataset (``group_nhu`` / ``y_hu``)."""
    return _run_struct_objective(trial, group_nhu, y_hu)
"storage = optuna.storages.RDBStorage(url=\"sqlite:///ano_analysis.db\", engine_kwargs={\"connect_args\": {\"timeout\": 10000}})\n", "# storage_urls = \"postgresql+psycopg2://postgres:{pwd}}@localhost:{num}}\"\n", "# storage = optuna.storages.RDBStorage(url=storage_urls)" ] }, { "cell_type": "code", "execution_count": 48, "metadata": {}, "outputs": [], "source": [ "try:\n", " optuna.delete_study(study_name=\"ANO_ws_struct\", storage=storage)\n", " optuna.delete_study(study_name=\"ANO_de_struct\", storage=storage)\n", " optuna.delete_study(study_name=\"ANO_lo_struct\", storage=storage)\n", " optuna.delete_study(study_name=\"ANO_hu_struct\", storage=storage)\n", "except:\n", " pass" ] }, { "cell_type": "code", "execution_count": 49, "metadata": {}, "outputs": [], "source": [ "TRIALS = 5" ] }, { "cell_type": "code", "execution_count": 50, "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "[I 2024-10-25 11:27:16,193] A new study R2 score: 0.72685
[I 2024-10-25 11:27:28,379] Trial 0 finished with value: 0.72685 and parameters: {'n_layers': 1, 'layer_dropout': 0, 'n_units_l_0': 9922, 'n_decay_l_0': 1e-05, 'last_dropout': 0.1, 'lr': 0.001}. Best is trial 0 with value: 0.72685. Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.707063 [I 2024-10-25 11:28:11,955] Trial 1 finished with value: 0.707063 and parameters: {'n_layers': 2, 'layer_dropout': 0, 'n_units_l_0': 6572, 'n_decay_l_0': 1e-05, 'n_units_l_1': 1332, 'n_decay_l_1': 0.0001, 'last_dropout': 0.3, 'lr': 1e-05}. Best is trial 0 with value: 0.72685. Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.705862 [I 2024-10-25 11:28:23,131] Trial 2 finished with value: 0.705862 and parameters: {'n_layers': 1, 'layer_dropout': 1, 'n_units_l_0': 3241, 'n_decay_l_0': 1e-05, 'F_dropout_0': 0.2, 'lr': 0.0001}. Best is trial 0 with value: 0.72685. Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.741337 [I 2024-10-25 11:30:53,672] Trial 3 finished with value: 0.741337 and parameters: {'n_layers': 3, 'layer_dropout': 0, 'n_units_l_0': 787, 'n_decay_l_0': 0.0001, 'n_units_l_1': 9082, 'n_decay_l_1': 0.001, 'n_units_l_2': 7890, 'n_decay_l_2': 0.001, 'last_dropout': 0.1, 'lr': 0.0001}. Best is trial 3 with value: 0.741337. Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.68373 [I 2024-10-25 11:31:13,484] Trial 4 finished with value: 0.68373 and parameters: {'n_layers': 1, 'layer_dropout': 0, 'n_units_l_0': 1253, 'n_decay_l_0': 0.0001, 'last_dropout': 0.2, 'lr': 1e-05}. Best is trial 3 with value: 0.741337. [I 2024-10-25 11:31:13,504] A new study created in RDB with name: ANO_de_struct
Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.803869 [I 2024-10-25 11:32:02,303] Trial 0 finished with value: 0.803869 and parameters: {'n_layers': 2, 'layer_dropout': 0, 'n_units_l_0': 1737, 'n_decay_l_0': 1e-05, 'n_units_l_1': 6702, 'n_decay_l_1': 1e-05, 'last_dropout': 0.2, 'lr': 0.001}. Best is trial 0 with value: 0.803869.
Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.826782 [I 2024-10-25 11:32:39,308] Trial 1 finished with value: 0.826782 and parameters: {'n_layers': 2, 'layer_dropout': 1, 'n_units_l_0': 9935, 'n_decay_l_0': 1e-05, 'F_dropout_0': 0.2, 'n_units_l_1': 3544, 'n_decay_l_1': 1e-05, 'F_dropout_1': 0.3, 'lr': 0.0001}. Best is trial 1 with value: 0.826782.
Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.823751 [I 2024-10-25 11:33:33,879] Trial 2 finished with value: 0.823751 and parameters: {'n_layers': 2, 'layer_dropout': 1, 'n_units_l_0': 7233, 'n_decay_l_0': 1e-05, 'F_dropout_0': 0.2, 'n_units_l_1': 4859, 'n_decay_l_1': 0.0001, 'F_dropout_1': 0.3, 'lr': 0.001}. Best is trial 1 with value: 0.826782.
Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.796098 [I 2024-10-25 11:34:39,639] Trial 3 finished with value: 0.796098 and parameters: {'n_layers': 3, 'layer_dropout': 1, 'n_units_l_0': 809, 'n_decay_l_0': 1e-05, 'F_dropout_0': 0.3, 'n_units_l_1': 3939, 'n_decay_l_1': 0.0001, 'F_dropout_1': 0.2, 'n_units_l_2': 6198, 'n_decay_l_2': 0.0001, 'F_dropout_2': 0.1, 'lr': 0.001}. Best is trial 1 with value: 0.826782.
Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.84961 [I 2024-10-25 11:37:05,301] Trial 4 finished with value: 0.84961 and parameters: {'n_layers': 2, 'layer_dropout': 0, 'n_units_l_0': 7109, 'n_decay_l_0': 0.001, 'n_units_l_1': 3436, 'n_decay_l_1': 1e-05, 'last_dropout': 0.3, 'lr': 1e-05}. Best is trial 4 with value: 0.84961. [I 2024-10-25 11:37:05,323] A new study created in RDB with name: ANO_lo_struct
Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.679332 [I 2024-10-25 11:39:42,324] Trial 0 finished with value: 0.679332 and parameters: {'n_layers': 2, 'layer_dropout': 1, 'n_units_l_0': 7114, 'n_decay_l_0': 0.001, 'F_dropout_0': 0.1, 'n_units_l_1': 7475, 'n_decay_l_1': 0.0001, 'F_dropout_1': 0.3, 'lr': 1e-05}. Best is trial 0 with value: 0.679332.
Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.668488 [I 2024-10-25 11:40:50,103] Trial 1 finished with value: 0.668488 and parameters: {'n_layers': 3, 'layer_dropout': 1, 'n_units_l_0': 2152, 'n_decay_l_0': 0.001, 'F_dropout_0': 0.1, 'n_units_l_1': 1830, 'n_decay_l_1': 0.0001, 'F_dropout_1': 0.1, 'n_units_l_2': 4427, 'n_decay_l_2': 0.0001, 'F_dropout_2': 0.3, 'lr': 0.0001}. Best is trial 0 with value: 0.679332.
Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.662751 [I 2024-10-25 11:41:01,389] Trial 2 finished with value: 0.662751 and parameters: {'n_layers': 1, 'layer_dropout': 0, 'n_units_l_0': 2892, 'n_decay_l_0': 0.001, 'last_dropout': 0.1, 'lr': 0.001}. Best is trial 0 with value: 0.679332.
Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.644237 [I 2024-10-25 11:41:12,311] Trial 3 finished with value: 0.644237 and parameters: {'n_layers': 1, 'layer_dropout': 0, 'n_units_l_0': 6028, 'n_decay_l_0': 0.0001, 'last_dropout': 0.2, 'lr': 0.001}. Best is trial 0 with value: 0.679332.
Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.619821 [I 2024-10-25 11:41:23,815] Trial 4 finished with value: 0.619821 and parameters: {'n_layers': 1, 'layer_dropout': 1, 'n_units_l_0': 8402, 'n_decay_l_0': 1e-05, 'F_dropout_0': 0.2, 'lr': 0.001}. Best is trial 0 with value: 0.679332. [I 2024-10-25 11:41:23,834] A new study created in RDB with name: ANO_hu_struct
Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.856321
[I 2024-10-25 11:42:47,320] Trial 0 finished with value: 0.856321 and parameters: {'n_layers': 2, 'layer_dropout': 0, 'n_units_l_0': 6594, 'n_decay_l_0': 0.0001, 'n_units_l_1': 301, 'n_decay_l_1': 0.001, 'last_dropout': 0.3, 'lr': 1e-05}. Best is trial 0 with value: 0.856321.
Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.852474 [I 2024-10-25 11:46:37,297] Trial 1 finished with value: 0.852474 and parameters: {'n_layers': 2, 'layer_dropout': 1, 'n_units_l_0': 6712, 'n_decay_l_0': 0.0001, 'F_dropout_0': 0.1, 'n_units_l_1': 6556, 'n_decay_l_1': 0.001, 'F_dropout_1': 0.1, 'lr': 1e-05}. Best is trial 0 with value: 0.856321.
Model already exists at save_model/full_model.keras
Model successfully saved to save_model/full_model.keras R2 score: 0.839939 [I 2024-10-25 11:46:47,755] Trial 2 finished with value: 0. Devices:\n", "I0000 00:00:1729824410.834333 789515 service.cc:154] StreamExecutor device (0): NVIDIA GeForce RTX 3060 Laptop GPU, Compute Capability 8.6\n", "I0000 00:00:1729824435.249758 789626 device_compiler.h:188] Compiled cluster using XLA! This line is logged at most once for the lifetime of the process.\n", "\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "R2 score: 0.845373\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "[I 2024-10-25 11:48:32,252] Trial 3 finished with value: 0.845373 and parameters: {'n_layers': 3, 'layer_dropout': 0, 'n_units_l_0': 4463, 'n_decay_l_0': 0.0001, 'n_units_l_1': 1966, 'n_decay_l_1': 0.001, 'n_units_l_2': 924, 'n_decay_l_2': 0.0001, 'last_dropout': 0.1, 'lr': 0.0001}. Best is trial 0 with value: 0.856321.\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "Model already exists at save_model/full_model.keras\n", "Model successfully saved to save_model/full_model.keras\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Error in subprocess: WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", "WARNING: All log messages before absl::InitializeLog() is called are written to STDERR\n", "I0000 00:00:1729824515.181650 795905 service.cc:146] XLA service 0x55dbccb5d560 initialized for platform Host (this does not guarantee that XLA will be used). Devices:\n", "I0000 00:00:1729824515.181691 795905 service.cc:154] StreamExecutor device (0): Host, Default Version\n", "I0000 00:00:1729824515.312754 795905 service.cc:146] XLA service 0x55dbccb71e90 initialized for platform CUDA (this does not guarantee that XLA will be used). 
# Huuskonen study: maximise R2 with successive-halving pruning.
# (A HyperbandPruner with min_resource=100, max_resource=1000,
# reduction_factor=3 was tried as an alternative and is currently not used.)
study_hu_struct = optuna.create_study(
    study_name='ANO_hu_struct',
    storage=storage,
    direction="maximize",
    pruner=optuna.pruners.SuccessiveHalvingPruner(
        reduction_factor=64,
        min_early_stopping_rate=10,
    ),
    load_if_exists=True,
)
study_hu_struct.optimize(objective_hu_struct, n_trials=TRIALS)

# Split the finished trials by final state for the summary below.
pruned_trials_hu_struct = study_hu_struct.get_trials(
    deepcopy=False, states=[TrialState.PRUNED]
)
complete_trials_hu_struct = study_hu_struct.get_trials(
    deepcopy=False, states=[TrialState.COMPLETE]
)
7890\n", " n_decay_l_2: 0.001\n", " last_dropout: 0.1\n", " lr: 0.0001\n" ] } ], "source": [
"# Print trial counts and the best trial of study_ws_struct.\n",
"print(\"Study statistics: [ws_structure] \")\n",
"print(\" Number of finished trials: \", len(study_ws_struct.trials))\n",
"print(\" Number of pruned trials: \", len(pruned_trials_ws_struct))\n",
"print(\" Number of complete trials: \", len(complete_trials_ws_struct))\n",
"print(\"Best trial:\")\n",
"try:\n",
"    trials_tmp = study_ws_struct.best_trial\n",
"except ValueError:\n",
"    # Optuna raises ValueError when the study has no completed trial (e.g. all pruned or failed).\n",
"    print(\" No completed trial available.\")\n",
"else:\n",
"    print(\" Value: \", trials_tmp.value)\n",
"    print(\" Params: \")\n",
"    for key, value in trials_tmp.params.items():\n",
"        print(\" {}: {}\".format(key, value))"
] }, { "cell_type": "code", "execution_count": 59, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Study statistics: [de_structure] \n", " Number of finished trials: 5\n", " Number of pruned trials: 0\n", " Number of complete trials: 5\n", "Best trial:\n", " Value: 0.84961\n", " Params: \n", " n_layers: 2\n", " layer_dropout: 0\n", " n_units_l_0: 7109\n", " n_decay_l_0: 0.001\n", " n_units_l_1: 3436\n", " n_decay_l_1: 1e-05\n", " last_dropout: 0.3\n", " lr: 1e-05\n" ] } ], "source": [
"# Print trial counts and the best trial of study_de_struct.\n",
"print(\"Study statistics: [de_structure] \")\n",
"print(\" Number of finished trials: \", len(study_de_struct.trials))\n",
"print(\" Number of pruned trials: \", len(pruned_trials_de_struct))\n",
"print(\" Number of complete trials: \", len(complete_trials_de_struct))\n",
"print(\"Best trial:\")\n",
"try:\n",
"    trials_tmp = study_de_struct.best_trial\n",
"except ValueError:\n",
"    # Optuna raises ValueError when the study has no completed trial (e.g. all pruned or failed).\n",
"    print(\" No completed trial available.\")\n",
"else:\n",
"    print(\" Value: \", trials_tmp.value)\n",
"    print(\" Params: \")\n",
"    for key, value in trials_tmp.params.items():\n",
"        print(\" {}: {}\".format(key, value))"
] }, { "cell_type": "code", "execution_count": 60, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Study statistics: [lo_structure] \n", " Number of finished trials: 5\n", " Number of pruned trials: 0\n", " Number of complete trials: 5\n", "Best trial:\n", " Value: 0.679332\n", " Params: \n", " n_layers: 2\n", " layer_dropout: 
1\n", " n_units_l_0: 7114\n", " n_decay_l_0: 0.001\n", " F_dropout_0: 0.1\n", " n_units_l_1: 7475\n", " n_decay_l_1: 0.0001\n", " F_dropout_1: 0.3\n", " lr: 1e-05\n" ] } ], "source": [
"# Print trial counts and the best trial of study_lo_struct.\n",
"print(\"Study statistics: [lo_structure] \")\n",
"print(\" Number of finished trials: \", len(study_lo_struct.trials))\n",
"print(\" Number of pruned trials: \", len(pruned_trials_lo_struct))\n",
"print(\" Number of complete trials: \", len(complete_trials_lo_struct))\n",
"print(\"Best trial:\")\n",
"try:\n",
"    trials_tmp = study_lo_struct.best_trial\n",
"except ValueError:\n",
"    # Optuna raises ValueError when the study has no completed trial (e.g. all pruned or failed).\n",
"    print(\" No completed trial available.\")\n",
"else:\n",
"    print(\" Value: \", trials_tmp.value)\n",
"    print(\" Params: \")\n",
"    for key, value in trials_tmp.params.items():\n",
"        print(\" {}: {}\".format(key, value))"
] }, { "cell_type": "code", "execution_count": 61, "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Study statistics: [hu_structure] \n", " Number of finished trials: 5\n", " Number of pruned trials: 0\n", " Number of complete trials: 5\n", "Best trial:\n", " Value: 0.856321\n", " Params: \n", " n_layers: 2\n", " layer_dropout: 0\n", " n_units_l_0: 6594\n", " n_decay_l_0: 0.0001\n", " n_units_l_1: 301\n", " n_decay_l_1: 0.001\n", " last_dropout: 0.3\n", " lr: 1e-05\n" ] } ], "source": [
"# Print trial counts and the best trial of study_hu_struct.\n",
"print(\"Study statistics: [hu_structure] \")\n",
"print(\" Number of finished trials: \", len(study_hu_struct.trials))\n",
"print(\" Number of pruned trials: \", len(pruned_trials_hu_struct))\n",
"print(\" Number of complete trials: \", len(complete_trials_hu_struct))\n",
"print(\"Best trial:\")\n",
"try:\n",
"    trials_tmp = study_hu_struct.best_trial\n",
"except ValueError:\n",
"    # Optuna raises ValueError when the study has no completed trial (e.g. all pruned or failed).\n",
"    print(\" No completed trial available.\")\n",
"else:\n",
"    print(\" Value: \", trials_tmp.value)\n",
"    print(\" Params: \")\n",
"    for key, value in trials_tmp.params.items():\n",
"        print(\" {}: {}\".format(key, value))"
] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": [], "source": [] }, { "cell_type": "code", "execution_count": null, "metadata": {}, "outputs": 
[], "source": [] } ], "metadata": { "kernelspec": { "display_name": "ai", "language": "python", "name": "python3" }, "language_info": { "codemirror_mode": { "name": "ipython", "version": 3 }, "file_extension": ".py", "mimetype": "text/x-python", "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", "version": "3.12.2" }, "orig_nbformat": 4 }, "nbformat": 4, "nbformat_minor": 2 }