Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update to v1.50 #29

Merged
merged 12 commits into from
Oct 22, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions bin/lsaBGC-Ready.py
Original file line number Diff line number Diff line change
Expand Up @@ -101,7 +101,7 @@ def create_parser():
parser.add_argument('-p', '--bgc_prediction_software', help='Software used to predict BGCs (Options: antiSMASH, DeepBGC, GECCO).\n[Default is antiSMASH].', default='antiSMASH', required=False)
parser.add_argument('-b', '--bigscape_results', help='Path to BiG-SCAPE results directory of antiSMASH/DeepBGC/GECCO results predicted\nin primary genomes. Please make sure the sample names match what is provided for "--genome_listings".', required=False, default=None)
parser.add_argument('-o', '--output_directory', help='Parent output/workspace directory.', required=True)
parser.add_argument('-om', '--ortholog_method', help="Software for inference of ortholog groups. (Options: OrthoFinder, SonicParanoid).\n[Default is OrthoFinder].", default='OrthoFinder', required=False)
parser.add_argument('-om', '--ortholog_method', help="Software for inference of ortholog groups. (Options: OrthoFinder, SonicParanoid, & Panaroo).\n[Default is OrthoFinder].", default='OrthoFinder', required=False)
parser.add_argument('-mc', '--run_coarse_orthofinder', action='store_true', help='Use coarse clustering of homolog groups in OrthoFinder instead of more\nresolute hierarchical determined homolog groups.', required=False, default=False)
parser.add_argument('-a', '--annotate', action='store_true', help='Perform annotation of BGC proteins using KOfam and PGAP (including TIGR)\nHMM profiles.', required=False, default=False)
parser.add_argument('-t', '--run_gtotree', action='store_true', help='Whether to create phylogeny and expected sample-vs-sample\ndivergence for downstream analyses using GToTree.', required=False, default=False)
Expand Down Expand Up @@ -176,7 +176,7 @@ def lsaBGC_Ready():
lsabgc_cluster_synteny = myargs.lsabgc_cluster_synteny

try:
assert (ortholog_method in set(['ORTHOFINDER', 'SONICPARANOID']))
assert (ortholog_method in set(['ORTHOFINDER', 'SONICPARANOID', 'PANAROO']))
except:
sys.stderr.write('Ortholog inference software specified is not a valid option.\n')
sys.exit(1)
Expand Down Expand Up @@ -427,6 +427,12 @@ def lsaBGC_Ready():
# Step 7 - SP: Run SonicParanoid2 with genome-wide predicted proteomes
orthofinder_bgc_matrix_file = util.runSonicParanoid2(final_proteomes_directory, orthofinder_directory, logObject, cpus=cpus)
os.system('mv %s %s' % (orthofinder_bgc_matrix_file, primary_orthofinder_matrix_file))
elif ortholog_method.upper() == 'PANAROO':
# Step 7 - PA: Run Panaroo with genome-wide genbanks
panaroo_input_dir = outdir + 'Panaroo_Input_GFFs/'
util.setupReadyDirectory([orthofinder_directory, panaroo_input_dir])
orthofinder_bgc_matrix_file = util.runPanaroo(final_genbanks_directory, panaroo_input_dir, orthofinder_directory, logObject, cpus=cpus)
os.system('mv %s %s' % (orthofinder_bgc_matrix_file, primary_orthofinder_matrix_file))

prim_samps_with_bgcs = set([])
additional_lines_to_append = []
Expand Down
18 changes: 11 additions & 7 deletions docker/withSonicParanoid/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -13,13 +13,16 @@ SHELL ["/bin/bash", "-c"]
# Clone lsaBGC github repo and create conda environment, then create and activate conda environment,
# and install lsaBGC
RUN apt-get update && apt-get install -y git wget libtiff5 gcc && \
git clone https://github.com/Kalan-Lab/lsaBGC && rm -rf lsaBGC/test_case.tar.gz && \
conda install -c conda-forge mamba python=3.9 libarchive && \
mamba env create -f /usr/src/lsaBGC/lsaBGC_env.yml -p /usr/src/lsaBGC_conda_env/ && \
mamba create -p /usr/src/antismash_conda_env/ -c bioconda -c conda-forge -c defaults antismash -y && \
source activate /usr/src/antismash_conda_env/ && download-antismash-databases && \
git clone -b develop-v1.40 https://github.com/Kalan-Lab/lsaBGC && rm -rf lsaBGC/test_case.tar.gz && \
conda install -n base conda-libmamba-solver && conda config --set solver libmamba && \
conda env create -f /usr/src/lsaBGC/lsaBGC_env.yml -p /usr/src/lsaBGC_conda_env/ && \
conda create -p /usr/src/antismash_conda_env/ -c bioconda -c conda-forge -c defaults python=3.11 hmmer2 hmmer diamond fasttree prodigal blast muscle glimmerhmm cxx-compiler meme && \
source activate /usr/src/antismash_conda_env/ && \
wget https://dl.secondarymetabolites.org/releases/7.0.1/antismash-7.0.1.tar.gz && tar -zxf antismash-7.0.1.tar.gz && \
pip install ./antismash-7.0.1 && download-antismash-databases && \
download-antismash-databases && \
conda remove --force meme && conda deactivate && \
conda clean --all -y && conda remove mamba && \
conda clean --all -y && \
echo "source activate /usr/src/lsaBGC_conda_env/" > ~/.bashrc && source ~/.bashrc && \
apt-get clean -y && apt-get autoclean -y && apt-get autoremove -y && \
rm -rf /var/lib/apt/lists/*
Expand All @@ -28,7 +31,7 @@ RUN apt-get update && apt-get install -y git wget libtiff5 gcc && \
WORKDIR /usr/src/lsaBGC/
ENV PATH /usr/src/lsaBGC_conda_env/bin:$PATH
RUN python setup.py install && pip install -e . && setup_annotation_dbs.py -nk -dsh && setup_bigscape.py && \
chmod -R 777 /usr/src/lsaBGC/ && chmod 777 /home
pip install --force-reinstall -v "setuptools==58.2.0" && chmod -R 777 /usr/src/lsaBGC/ && chmod 777 /home
USER 1000:1000
RUN mkdir /home/input /home/output
WORKDIR /home
Expand All @@ -38,3 +41,4 @@ ENV LC_ALL C.UTF-8
ENTRYPOINT ["LSABGC"]
CMD ["--help"]


19 changes: 11 additions & 8 deletions docker/withoutSonicParanoid/Dockerfile
Original file line number Diff line number Diff line change
Expand Up @@ -12,22 +12,24 @@ SHELL ["/bin/bash", "-c"]

# Clone lsaBGC github repo and create conda environment, then create and activate conda environment,
# and install lsaBGC
RUN apt-get update && apt-get install -y git wget && \
git clone https://github.com/Kalan-Lab/lsaBGC && rm -rf lsaBGC/test_case.tar.gz && \
conda install -c conda-forge mamba python=3.9 libarchive && \
mamba env create -f /usr/src/lsaBGC/lsaBGC_env.yml -p /usr/src/lsaBGC_conda_env/ && \
mamba create -p /usr/src/antismash_conda_env/ -c bioconda -c conda-forge -c defaults antismash -y && \
source activate /usr/src/antismash_conda_env/ && download-antismash-databases && \
RUN apt-get update && apt-get install -y git wget libtiff5 gcc && \
git clone -b develop-v1.40 https://github.com/Kalan-Lab/lsaBGC && rm -rf lsaBGC/test_case.tar.gz && \
conda install -n base conda-libmamba-solver && conda config --set solver libmamba && \
conda env create -f /usr/src/lsaBGC/lsaBGC_env.yml -p /usr/src/lsaBGC_conda_env/ && \
conda create -p /usr/src/antismash_conda_env/ -c bioconda -c conda-forge -c defaults python=3.11 hmmer2 hmmer diamond fasttree prodigal blast muscle glimmerhmm cxx-compiler meme && \
source activate /usr/src/antismash_conda_env/ && \
wget https://dl.secondarymetabolites.org/releases/7.0.1/antismash-7.0.1.tar.gz && tar -zxf antismash-7.0.1.tar.gz && \
pip install ./antismash-7.0.1 && download-antismash-databases && \
download-antismash-databases && \
conda remove --force meme && conda deactivate && \
conda clean --all -y && conda remove mamba && \
conda clean --all -y && \
echo "source activate /usr/src/lsaBGC_conda_env/" > ~/.bashrc && source ~/.bashrc && \
apt-get clean -y && apt-get autoclean -y && apt-get autoremove -y && \
rm -rf /var/lib/apt/lists/*

# Install lsaBGC
WORKDIR /usr/src/lsaBGC/
ENV PATH /usr/src/lsaBGC_conda_env/bin:$PATH
# lsaBGC's setup.py should be modified to not include code for pip installation and setup of sonicparanoid
RUN python setup.py install && pip install -e . && setup_annotation_dbs.py -nk -dsh && setup_bigscape.py && \
chmod -R 777 /usr/src/lsaBGC/ && chmod 777 /home
USER 1000:1000
Expand All @@ -39,3 +41,4 @@ ENV LC_ALL C.UTF-8
ENTRYPOINT ["LSABGC"]
CMD ["--help"]


178 changes: 156 additions & 22 deletions lsaBGC/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -1707,6 +1707,26 @@ def performKOFamAndPGAPAnnotation(sample_bgc_proteins, bgc_prot_directory, annot
raise RuntimeError(traceback.format_exc())
return dict(sample_protein_annotations)

def determineAsofName(asof_index):
    """
    Return the string form of an index, left-padded with zeros to 7 characters.

    The result is used as the numeric suffix of generated orthogroup
    identifiers (callers prepend a prefix such as 'ASOF' or 'OG').

    :param asof_index: index to format; converted with str().
    :return: str(asof_index) padded on the left with '0' to a width of seven
             characters. Values whose string form is already seven characters
             or longer are returned unchanged.
    """
    # A single pad call replaces the per-length if/elif ladder. The ladder's
    # fall-through branch assigned to a misspelled variable, which left the
    # result as None and tripped the assertion for indices >= 1,000,000.
    asof_name = str(asof_index).rjust(7, '0')
    return asof_name

def runOrthoFinder2Full(bgc_prot_directory, orthofinder_outdir, logObject, cpus=1):
result_file = orthofinder_outdir + 'Final_Orthogroups.tsv'
try:
Expand All @@ -1724,9 +1744,69 @@ def runOrthoFinder2Full(bgc_prot_directory, orthofinder_outdir, logObject, cpus=
main_file = orthofinder_outdir + 'Orthogroups/Orthogroups.tsv'
singletons_file = orthofinder_outdir + 'Orthogroups/Orthogroups_UnassignedGenes.tsv'
n0_file = orthofinder_outdir + 'Phylogenetic_Hierarchical_Orthogroups/N0.tsv'
putative_xenologs_dir = orthofinder_outdir + 'Putative_Xenologs/'
phylo_misplaced_genes_dir = orthofinder_outdir + 'Phylogenetically_Misplaced_Genes/'

result_handle = open(result_file, 'w')

gene_to_hog = {}
gene_to_og = {}
with open(n0_file) as n0f:
for i, line in enumerate(n0f):
line = line.strip('\n')
ls = line.split('\t')
if i == 0:
genomes = ls[3:]
result_handle.write('Orthogroup\t' + '\t'.join(genomes) + '\n')
else:
hog = ls[0].split('N0.')[1]
og = ls[1]
for j, gs in enumerate(ls[3:]):
genome = genomes[j]
for gene in gs.split(', '):
gene = gene.strip()
gene_to_hog[gene] = hog
gene_to_og[gene] = og

genome_misplaced_genes = defaultdict(set)
fg_to_genome = {}
for f in os.listdir(phylo_misplaced_genes_dir):
genome = f.split('.txt')[0]
with open(phylo_misplaced_genes_dir + f) as opmgdf:
for i, line in enumerate(opmgdf):
line = line.strip()
genome_misplaced_genes[genome].add(line)
fg_to_genome[line] = genome

close_hogs = defaultdict(lambda: defaultdict(int))
for f in os.listdir(putative_xenologs_dir):
genome = f.split('.tsv')[0]
with open(putative_xenologs_dir + f) as opxf:
for i, line in enumerate(opxf):
if i == 0: continue
line = line.strip()
og, focal_genes, other_genes = line.split('\t')
for fg in focal_genes.split(', '):
if not fg in genome_misplaced_genes[genome]: continue
for otg in other_genes.split(', '):
if otg in gene_to_hog:
close_hogs[fg][gene_to_hog[otg]] += 1

hog_missing_to_add = defaultdict(lambda: defaultdict(set))
for fg in close_hogs:
max_value = max(close_hogs[fg].values())
if max_value == 0: continue
top_hits = 0
top_hog = None
for hog in close_hogs[fg]:
if close_hogs[fg][hog] == max_value:
top_hog = hog
top_hits += 1
if top_hits == 1:
hog_missing_to_add[top_hog][genome].add(fg)

genomes = []
og_genes_in_hog = defaultdict(set)
genome_genes_accounted = defaultdict(set)
result_handle = open(result_file, 'w')
with open(n0_file) as n0f:
for i, line in enumerate(n0f):
Expand All @@ -1738,15 +1818,32 @@ def runOrthoFinder2Full(bgc_prot_directory, orthofinder_outdir, logObject, cpus=
else:
hog = ls[0].split('N0.')[1]
og = ls[1]
printlist = []
for j, gs in enumerate(ls[3:]):
genome = genomes[j]
gss = set(gs.split(', '))
gss_with_missing = gss.union(hog_missing_to_add[hog][genome])
printlist.append(', '.join(sorted(gss_with_missing)))
for gene in gss_with_missing:
genome_genes_accounted[genome].add(gene)
result_handle.write(hog + '\t' + '\t'.join(printlist) + '\n')

genomes_sf = []
with open(singletons_file) as osf:
for i, line in enumerate(osf):
line = line.strip('\n')
ls = line.split('\t')
if i == 0:
genomes_sf = ls[1:]
else:
for j, gs in enumerate(ls[1:]):
genome = genomes_sf[j]
for gene in gs.split(', '):
gene = gene.strip()
gene_genome_pair = tuple([gene, genome])
og_genes_in_hog[og].add(gene_genome_pair)
result_handle.write(hog + '\t' + '\t'.join(ls[3:]) + '\n')
genome_genes_accounted[genome].add(gene)
result_handle.write(line + '\n')

genomes_mf = []
asof_index = 0
with open(main_file) as omf:
for i, line in enumerate(omf):
line = line.strip('\n')
Expand All @@ -1759,24 +1856,17 @@ def runOrthoFinder2Full(bgc_prot_directory, orthofinder_outdir, logObject, cpus=
value_count = 0
for j, gs in enumerate(ls[1:]):
genome = genomes_mf[j]
updated_gs = []
for gene in gs.split(', '):
gene_genome_pair = tuple([gene, genome])
if not gene_genome_pair in og_genes_in_hog[og]:
updated_gs.append(gene)
value_count += 1
printlist.append(', '.join(updated_gs))
result_handle.write('\t'.join(printlist) + '\n')

genomes_sf = []
with open(singletons_file) as osf:
for i, line in enumerate(osf):
line = line.strip('\n')
ls = line.split('\t')
if i == 0:
genomes_sf = ls[1:]
else:
result_handle.write(line + '\n')
if not gene in genome_genes_accounted[genome]:
asof_name = 'ASOF' + determineAsofName(asof_index)
printlist = [asof_name]
for gen in genomes_mf:
if gen == genome:
printlist.append(gene)
else:
printlist.append('')
result_handle.write('\t'.join(printlist) + '\n')
asof_index += 1
result_handle.close()

assert (genomes == genomes_mf and genomes == genomes_sf)
Expand Down Expand Up @@ -1888,6 +1978,50 @@ def runSonicParanoid2(bgc_prot_directory, sonicparanoid_outdir, logObject, cpus=
raise RuntimeError(traceback.format_exc())
return result_file

def runPanaroo(genbanks_directory, panaroo_input_dir, results_directory, logObject, cpus=1, panaroo_options='--clean-mode moderate --remove-invalid-genes'):
    """
    Convert GenBank files to GFF, run Panaroo on them and write an
    orthogroup-by-sample matrix in tab-delimited form.

    Directory arguments are joined to file names by plain string
    concatenation, so each is expected to end with a path separator.

    :param genbanks_directory: directory whose files are each converted to a
                               GFF via a 'genbankToProkkaGFF.py' command.
    :param panaroo_input_dir: directory where the converted '.gff' files go.
    :param results_directory: Panaroo output directory; also where
                              'Final_Orthogroups.tsv' is written.
    :param logObject: logger exposing .info() and .error().
    :param cpus: size of the conversion worker pool and Panaroo's '-t' value.
    :param panaroo_options: extra options inserted verbatim into the Panaroo
                            shell command.
    :return: path to the 'Final_Orthogroups.tsv' file.
    :raises RuntimeError: if any step fails; the message is the formatted
                          traceback of the underlying error.
    """
    result_file = results_directory + 'Final_Orthogroups.tsv'
    # Bound before the try block so the error handler below can always report
    # it; otherwise a failure prior to assembling the command (e.g. listing a
    # missing directory) would raise UnboundLocalError from inside the handler
    # and mask the real problem.
    panaroo_cmd = []
    try:
        reformat_cmds = []
        panaroo_inputs = []
        for f in os.listdir(genbanks_directory):
            inf = genbanks_directory + f
            # Swap the final extension of the input file name for '.gff'.
            outf = panaroo_input_dir + '.'.join(f.split('.')[:-1]) + '.gff'
            # NOTE(review): logObject is appended as the last element of the
            # command list, presumably because multiProcess (defined elsewhere)
            # expects it there - confirm against that helper.
            reformat_cmd = ['genbankToProkkaGFF.py', '-i', inf, '-o', outf, logObject]
            reformat_cmds.append(reformat_cmd)
            panaroo_inputs.append(outf)

        p = multiprocessing.Pool(cpus)
        try:
            p.map(multiProcess, reformat_cmds)
        finally:
            p.close()

        panaroo_cmd = ['panaroo', '-t', str(cpus), panaroo_options, '-i', ' '.join(panaroo_inputs), '-o', results_directory]

        logObject.info('Running the following command: %s' % ' '.join(panaroo_cmd))
        subprocess.call(' '.join(panaroo_cmd), shell=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL,
                        executable='/bin/bash')
        logObject.info('Successfully ran Panaroo!')

        main_ortho_file = results_directory + 'gene_presence_absence.csv'

        # Context managers guarantee both handles are closed even if a line
        # fails to parse part-way through.
        with open(result_file, 'w') as result_handle, open(main_ortho_file) as omof:
            for i, line in enumerate(omof):
                line = line.strip()
                ls = line.split(',')
                if i == 0:
                    # The first three columns are dropped; the remaining
                    # header columns are carried over as-is.
                    result_handle.write('\t'.join(['OrthoGroup'] + ls[3:]) + '\n')
                else:
                    # Row number doubles as the orthogroup index; ';'-separated
                    # entries within a cell are rewritten as ', '-separated.
                    og_id = 'OG' + determineAsofName(i)
                    result_handle.write('\t'.join([og_id] + [', '.join(x.split(';')) for x in ls[3:]]) + '\n')

        assert (os.path.isfile(result_file))
    except Exception as e:
        logObject.error("Problem with running Panaroo cmd: %s." % ' '.join(panaroo_cmd))
        logObject.error(traceback.format_exc())
        raise RuntimeError(traceback.format_exc())
    return result_file

def determineParalogyThresholds(orthofinder_bgc_matrix_file, bgc_prot_directory, blast_directory, logObject, cpus=1):
paralogy_thresholds = defaultdict(
lambda: [90.0, 90.0]) # item 1 : percent identiy threshold; item 2 : query coverage threshold
Expand Down
3 changes: 2 additions & 1 deletion lsaBGC_env.yml
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,10 @@ dependencies:
- r-data.table
- r-plyr
- r-gridextra
- panaroo
- bioconda::orthofinder=2.5.4
- bioconda::prodigal
- bioconda::pyrodigal
- bioconda::pyrodigal=2.3.0
- conda-forge::tar
- conda-forge::pandas=1.4.2
- conda-forge::pomegranate=0.13.3
Expand Down
Loading