# rdkit_fingerprint.py
# Computes 2048-bit Morgan fingerprints with RDKit and writes them to a CSV file.
import numpy as np
from rdkit.Chem import AllChem, DataStructs
from rdkit import Chem
import csv
def load_csv(file):
    """Read a CSV file and return all of its rows.

    Args:
        file: Path of the CSV file to read; it is parsed with the ``excel``
            dialect.

    Returns:
        A list with one entry per row of the file (the first row included);
        each entry is the list of string fields of that row.  An empty file
        yields an empty list.
    """
    # newline="" hands line-ending handling to the csv module, so a quoted
    # field that contains a line break comes back exactly as it was written.
    with open(file, "r", newline="") as csv_file:
        return list(csv.reader(csv_file, dialect='excel'))
def get_feature_dict(file, delimiter=',', key_index=0, use_int=False):
    """Load a delimited text table into a dict keyed by one of its columns.

    The first row of the file is treated as a header and skipped.

    Args:
        file: Path of the table to read.
        delimiter: Field separator passed to ``csv.reader``.
        key_index: Index of the column whose value becomes the dict key.
        use_int: When true, every value is converted with ``int``; otherwise
            the values stay as strings.

    Returns:
        A dict mapping ``row[key_index]`` to ``row[1:]`` for every data row.
        The value slice always drops the first column only, whatever
        ``key_index`` is.  If a key occurs more than once, the last row wins.
        A file without any rows yields an empty dict.

    Raises:
        ValueError: If ``use_int`` is true and a value is not an integer.
        IndexError: If a row has no column at ``key_index``.
    """
    # newline="" hands line-ending handling to the csv module, as its
    # documentation asks for.
    with open(file, "r", newline="") as csv_file:
        reader = csv.reader(csv_file, dialect='excel', delimiter=delimiter)
        # Skip the header; the default keeps an empty file from raising
        # StopIteration.
        next(reader, None)
        if use_int:
            return {
                row[key_index]: [int(value) for value in row[1:]]
                for row in reader
            }
        return {row[key_index]: row[1:] for row in reader}
import os

# Script body: read the compound table, compute one Morgan fingerprint per
# compound and write all fingerprints to a single CSV file.

finger_dimension = 2048  # number of bits in every fingerprint
molecules = []  # RDKit molecules that were fingerprinted successfully
fps = []        # one fingerprint array per entry of `molecules`
ids = []        # compound keys, in the same order as `fps`
n_failed = 0    # running count of compounds that had to be skipped

path = os.path.dirname(os.path.abspath(__file__))
print(path)

drug_dict = get_feature_dict('GSE92742_Broad_LINCS_pert_info.txt', delimiter='\t', use_int=False)  # phase 1
# drug_dict = get_feature_dict('GSE70138_Broad_LINCS_pert_info.txt', delimiter='\t', use_int=False) # uncomment for phase 2
# rnaseq drugs # uncomment this and change filename below to get inhouse_morgan_2048.csv
# drug_dict = {}
# drug_dict['Enzalutamide'] = ['','','','','','CNC(=O)C1=C(F)C=C(C=C1)N1C(=S)N(C(=O)C1(C)C)C1=CC=C(C#N)C(=C1)C(F)(F)F']
# drug_dict['VPC14449'] = ['','','','','','Brc1n(-c2nc(N3CCOCC3)sc2)cc(Br)n1']
# drug_dict['VPC17005'] = ['','','','','','O=C(NC=1SCCN=1)c1c2c(sc1)cccc2']

for key, fields in drug_dict.items():
    # Reset per compound so the failure report never shows a molecule left
    # over from an earlier iteration (or an undefined name).
    m = None
    fp = None
    try:
        # Position 5 of the stored fields is used as the SMILES string.
        m = Chem.MolFromSmiles(fields[5])
        if m is not None:
            # ConvertToNumpyArray resizes and fills the array it is given.
            fp = np.zeros((1,))
            DataStructs.ConvertToNumpyArray(
                AllChem.GetMorganFingerprintAsBitVect(m, 2, nBits=finger_dimension), fp)
    except Exception:
        # Best effort: one bad record must not abort the whole run.  Unlike a
        # bare `except:`, this still lets Ctrl-C and SystemExit through.
        fp = None
    if fp is None:
        print(n_failed, key, m)
        n_failed += 1
        continue
    molecules.append(m)
    fps.append(fp)
    ids.append(key)

# Header row: the compound column followed by one column per fingerprint bit.
header = ["mol"] + ["fps" + str(bit) for bit in range(finger_dimension)]
fps = np.array(fps).reshape(len(fps), finger_dimension)
ids = np.array(ids).reshape(len(fps), 1)
# Stacking the string keys next to the numeric bits yields an all-string
# table, which is why it is written with fmt="%s".
data = np.hstack((ids, fps))
header = np.array(header).reshape(1, len(header))
data_header = np.vstack((header, data))
np.savetxt("phase1_compounds_morgan_2048.csv", data_header, delimiter=",", fmt="%s")