-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathvirtree.py
130 lines (105 loc) · 4.72 KB
/
virtree.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
#!/usr/bin/env python
import sys, os, re, shlex, subprocess, pandas, argparse
from collections import defaultdict
#hmmdb = "hmm/vog_05p.hmm"
# run HMMER3
def run_hmmer(input_file, cpus, evalue, database):
output_file = re.sub(".faa", ".hmmout", input_file)
cmd = "hmmsearch -E "+ str(evalue) +" --cpu "+ cpus +" --tblout "+ output_file +" "+ database +" "+ input_file
print(cmd)
cmd2 = shlex.split(cmd)
subprocess.call(cmd2, stdout=open("out.txt", "w"), stderr=open("err.txt", "w"))
os.remove("out.txt")
return output_file
# define function for parsing HMMER3 output
def parse_hmmout(hmmout):
input = open(hmmout, "r")
final_dict = defaultdict(int)
hit_dict = defaultdict(lambda:"no_annot")
bit_dict = defaultdict(float)
for i in input.readlines():
line = i.rstrip()
if line.startswith("#"):
pass
else:
newline = re.sub("\s+", "\t", line)
tabs = newline.split("\t")
protein = tabs[0]
hit = tabs[2]
eval = float(tabs[4])
score = float(tabs[5])
if score > 30:
if score > bit_dict[protein]:
bit_dict[protein] = score
hit_dict[protein] = hit
else:
pass
else:
pass
for i in hit_dict:
vog = hit_dict[i]
final_dict[vog] +=1
return final_dict
# main function that runs the program
def run_program(inputdir, project, database, minhit, evalue, cpus, redo):
df = pandas.DataFrame()
for i in os.listdir(inputdir):
if i.endswith(".faa"):
#name = re.sub("_genomic.fna.faa", "", i)
inputfile = os.path.join(inputdir, i)
#print(inputfile)
#protein_file = predict_proteins(inputfile, inputdir)
hmmout = run_hmmer(inputfile, cpus, evalue, database)
hit_dict = parse_hmmout(hmmout)
# fulllist = i.split(".")
# genomelist = fulllist[0:-2]
# genome = ".".join(genomelist)
genome = re.sub(".faa", "", i)
#print(genome)
s1 = pandas.DataFrame(pandas.Series(hit_dict, name = genome))
df = pandas.concat([df, s1], axis=1, sort=True)
df = df.fillna(0)
df2 = df.clip(0,1)
prof_tbl = project + "_profile.tsv"
df2.to_csv(prof_tbl,sep="\t")
#if BINARY == 1:
# df2 = df.clip(upper=1)
#else:
# df2 = df.clip(upper=9)
outputfile = project + "_bin.fna"
o = open(outputfile, "w")
for (columnName, columnData) in df2.items():
vogsum = sum([float(n) for n in columnData])
if vogsum >= minhit:
#print(columnName, vogsum)
string = "".join([str(int(n)) for n in columnData])
o.write(">"+ columnName +"\n"+ string +"\n")
########################################################################
##### use argparse to run through the command line options given #######
########################################################################
def main(argv=None):
args_parser = argparse.ArgumentParser(formatter_class=argparse.RawDescriptionHelpFormatter, description="hmmtree: Part of a workflow for making trees based on protein family presence/absence \nFrank O. Aylward, Virginia Tech Department of Biological Sciences <faylward at vt dot edu>", epilog='*******************************************************************\n\n*******************************************************************')
args_parser.add_argument('-i', '--inputdir', required=True, help='Input folder of protein FASTA files (ending in .faa)')
args_parser.add_argument('-p', '--project', required=True, help='project name for outputs')
args_parser.add_argument('-db', '--database', required=False, default="hmm/vog_05p.hmm", help='PATH to HMM database to use. Default is hmm/vog_05p.hmm (default), vog_025p.hmm, and vog_005p.hmm. See README for details')
args_parser.add_argument('-g', '--minhit', required=False, default=int(1), help='minimum number of VOG hits that each viral region must have to be reported (default=4)')
args_parser.add_argument('-e', '--evalue', required=False, default=str(1e-3), help='e-value that is passed to HMMER3 for the VOG hmmsearch (default=1e-10)')
args_parser.add_argument('-t', '--cpus', required=False, default=str(1), help='number of cpus to use for the HMMER3 search')
args_parser.add_argument('-r', '--redo', type=bool, default=False, const=True, nargs='?', help='run without re-launching prodigal and HMMER3 (for quickly re-calculating outputs with different parameters if you have already run once)')
args_parser.add_argument('-v', '--version', action='version', version='ViralRecall v. 2.1')
args_parser = args_parser.parse_args()
# set up object names for input/output/database folders
inputdir = args_parser.inputdir
project = args_parser.project
database = args_parser.database
minhit = int(args_parser.minhit)
evalue = str(args_parser.evalue)
cpus = args_parser.cpus
redo = args_parser.redo
project = project.rstrip("/")
run_program(inputdir, project, database, minhit, evalue, cpus, redo)
return 0
if __name__ == '__main__':
status = main()
sys.exit(status)
# end