diff --git a/code/Datafusion1.0.ipynb b/code/Datafusion1.0.ipynb new file mode 100644 index 0000000..8485feb --- /dev/null +++ b/code/Datafusion1.0.ipynb @@ -0,0 +1,559 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 95, + "metadata": {}, + "outputs": [], + "source": [ + "# Funktionen comparee_all_coefficients(smiles) tar fram en matris \n", + "# som visar nivån av likhet mellan alla molekyler två och två med datafused\n", + "# likhetskoefficienter.\n", + "# Anledningen till att hälften av matrisen har NaN-värden är att det annars blir\n", + "# en upprepning av alla värden. \n", + "\n", + "# Funktionen comparee_all_coefficients(smiles)\n", + "# 1. Skriver ut värdena för 4 olika likhetsmått. En datafusion görs som \n", + "# skrivs ut. \n", + "\n", + "# 2. Kollar om de olika likhetskoefficienterna kommer fram till \n", + "# samma molekyl som är mest lik referensmolekylen. Om de inte gör det \n", + "# skrivs ett meddelande ut som säger vilka likhetskoefficienter som \n", + "# inte överensstämmer.\n", + "\n", + "# 3. Radar upp alla molekyler och motsvarande molekyl som är mest lik." + ] + }, + { + "cell_type": "code", + "execution_count": 1, + "metadata": {}, + "outputs": [], + "source": [ + "from rdkit import Chem\n", + "from rdkit import DataStructs\n", + "from rdkit.Chem import AllChem, MACCSkeys\n", + "from rdkit.Chem.Fingerprints import FingerprintMols\n", + "#Ursprungliga fps, tar inte hänsyn till om referensen är i smiles-strängen. 
\n", + "descriptors = {\n", + " 'maccs': lambda m: MACCSkeys.GenMACCSKeys(m),\n", + " 'morgan3': lambda m: AllChem.GetMorganFingerprintAsBitVect(m,3),\n", + " 'morgan5': lambda m: AllChem.GetMorganFingerprintAsBitVect(m,5),\n", + " 'rdkit': lambda m: FingerprintMols.FingerprintMol(m)\n", + "}\n", + "\n", + "metrics = {\n", + " 'asymmetric': DataStructs.AsymmetricSimilarity,\n", + " 'braunblanquet': DataStructs.BulkBraunBlanquetSimilarity,\n", + " 'cosine': DataStructs.BulkCosineSimilarity,\n", + " 'dice': DataStructs.BulkDiceSimilarity,\n", + " 'kulczynski': DataStructs.BulkKulczynskiSimilarity,\n", + " 'mcconnaughey': DataStructs.BulkMcConnaugheySimilarity,\n", + " 'rogotgoldberg': DataStructs.BulkRogotGoldbergSimilarity,\n", + " 'russel': DataStructs.BulkRusselSimilarity,\n", + " 'sokal': DataStructs.BulkSokalSimilarity,\n", + " 'tanimoto': DataStructs.BulkTanimotoSimilarity\n", + "}\n", + "\n", + "\"\"\"\n", + "Returns a list of similarity scores for a list of smiles strings compared to a\n", + "reference compound. 
The fingerprints and similarity coefficients can be chosen\n", + "from the list of descriptors and metrics (default 'rdkit' and 'tanimoto').\n", + "\"\"\"\n", + "def fpss_sim(ref, smiles, descriptor='rdkit', metric='tanimoto'):\n", + " # ref - reference smiles\n", + " # smiles - list of smiles to compare with\n", + " # descriptor - fingerprint type from \"descriptors\", default 'rdkit'\n", + " # similarity score from \"metrics\", default 'tanimoto'\n", + "\n", + " if descriptor not in descriptors:\n", + " raise ValueError('Invalid descriptor name ' + descriptor)\n", + "\n", + " if metric not in metrics:\n", + " raise ValueError('Invalid metric ' + metric)\n", + " \n", + " \n", + " ref_ms = Chem.MolFromSmiles(ref)\n", + " ms=[]\n", + " fps=[]\n", + " for x in smiles: \n", + " \n", + " lo=Chem.MolFromSmiles(x)\n", + " ms.append(lo)\n", + " fingerprint = descriptors[descriptor]\n", + " ref_fps = fingerprint(ref_ms)\n", + " fps.append(fingerprint(lo))\n", + " score = metrics[metric] \n", + " \n", + "\n", + " \n", + " return score(ref_fps, fps)" + ] + }, + { + "cell_type": "code", + "execution_count": 18, + "metadata": {}, + "outputs": [], + "source": [ + "import numpy as np\n", + "from sklearn.feature_extraction import DictVectorizer\n", + "import pandas as pd\n", + "from copy import copy\n", + "#itertols.combination\n", + "#Får ut en matris av alla Koefficienter i smiles. 
\n", + "def comparee_all_coefficients(smiles): \n", + " \n", + " datafusion1=[]\n", + "\n", + " scores=[\"Tan\",\"Cos\",\"Dice\",\"Sokal\"]\n", + " scores.sort()\n", + "\n", + " count=0\n", + " ref1=[]\n", + "\n", + " while count < len(smiles):\n", + "\n", + " ref=smiles[count]\n", + " \n", + " \n", + " tan=fpss_sim(ref, smiles, descriptor='rdkit', metric='tanimoto')\n", + " dice=fpss_sim(ref, smiles, descriptor='rdkit', metric='dice')\n", + " cos=fpss_sim(ref, smiles, descriptor='rdkit', metric='cosine')\n", + " sokal=fpss_sim(ref, smiles, descriptor='rdkit', metric='sokal')\n", + " \n", + "\n", + " df = pd.DataFrame({'Cos':cos,'Dice':dice, 'Sokal': sokal, 'Tan':tan}, index=smiles)\n", + " covariance = df.cov()\n", + " #print(df)\n", + "\n", + " #Får ut datafusion i vektor \n", + " datafusion=[]\n", + " col_list= list(df)\n", + " data=(df.loc[:,col_list].sum(axis=1).values)/4\n", + " for i in data:\n", + " datafusion.append(i)\n", + " \n", + " datafusion1.append(datafusion)\n", + " \n", + " \n", + " \n", + " count+=1\n", + " ref1.append(ref)\n", + " df1 = pd.DataFrame(datafusion1,columns=smiles,index=ref1,dtype=float)\n", + " ble=df1.values\n", + " \n", + " n=0\n", + " k=0\n", + " while n