From c5f4732684140f5a97e56943f09d95e18012a3fd Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Fri, 10 May 2024 16:05:01 -0500 Subject: [PATCH 1/4] updated blast build class --- locidex/build.py | 107 ++++++++-------- locidex/classes/blast.py | 259 ++++++++++++++++++++++---------------- locidex/classes/blast2.py | 167 ------------------------ locidex/extract.py | 5 +- locidex/search.py | 3 +- tests/test_blast.py | 88 +++++-------- tests/test_blast2.py | 40 ------ tests/test_extractor.py | 43 +++++-- 8 files changed, 269 insertions(+), 443 deletions(-) delete mode 100644 locidex/classes/blast2.py delete mode 100644 tests/test_blast2.py diff --git a/locidex/build.py b/locidex/build.py index 3df3fa2..7955fa1 100644 --- a/locidex/build.py +++ b/locidex/build.py @@ -2,41 +2,47 @@ from datetime import datetime import pandas as pd import os, sys +from pathlib import Path from argparse import (ArgumentParser, ArgumentDefaultsHelpFormatter, RawDescriptionHelpFormatter) from locidex.version import __version__ from locidex.constants import DBFiles from locidex.classes import run_command from locidex.constants import DBConfig +from locidex.classes.blast import BlastMakeDB +from locidex.manifest import DBData +import getpass + +import logging +import sys + +logger = logging.getLogger(__name__) +logging.basicConfig(filemode=sys.stderr, level=logging.INFO) class locidex_build: input_file = None - outdir = None - blast_dir = None - is_dna = False - is_protein = False - df = None config = {} meta = {} num_seqs = 0 - status = True - messages = [] - def __init__(self, input_file: os.PathLike, outdir: os.PathLike, config: DBConfig, seq_columns={'nucleotide':'dna_seq','protein':'aa_seq'},force=False,parse_seqids=False): + + def __init__(self, input_file: os.PathLike, outdir: os.PathLike, config: DBConfig, seq_columns={'nucleotide':'dna_seq','protein':'aa_seq'},force=False,parse_seqids=False,translation_table: int = 11): self.input_file = input_file + self.translation_table = translation_table self.outdir = outdir self.force = force self.status = self.init_dir(self.outdir) self.parse_seqids = parse_seqids - if not self.status: - return - self.status = self.blast_dir = os.path.join(outdir,'blast') + self.is_dna = False + self.is_protein = False + self.blast_dir = outdir.joinpath("blast") + #self.status = self.blast_dir = os.path.join(outdir,'blast') self.init_dir(self.blast_dir) if not self.status: return - self.df = self.read_data( self.input_file) + self.df = self.read_data(self.input_file) self.config = config self.config.db_num_seqs = len(self.df) @@ -44,37 +50,37 @@ def __init__(self, input_file: os.PathLike, outdir: os.PathLike, config: DBConfi col_name = seq_columns[t] s = self.is_seqtype_present(col_name) if s: - outfile = os.path.join(self.blast_dir, t) - if t == 'nucleotide': + #outfile = os.path.join(self.blast_dir, t) + outfile = self.blast_dir.joinpath(t) + if t == DBData.nucleotide_name(): self.is_dna = True self.config.nucleotide_db_name = t - blast_method = 'nucl' - elif t == 'protein': - + blast_method = DBData.nucleotide_db_type() + elif t == DBData.protein_name(): self.is_protein = True self.config.protein_db_name = t - blast_method = 'prot' - self.create_seq_db(t, col_name, outfile, blast_method) + blast_method = DBData.protein_db_type() + self.init_dir(outfile) + out_fasta = outfile.joinpath("{}.fasta".format(t)) + self.write_seq_file(out_fasta, col_name) + creating_db = BlastMakeDB(input_file=out_fasta, + db_type=blast_method, + parse_seqids=self.parse_seqids, + output_db_path=out_fasta) + creating_db.makeblastdb() self.config.is_nucl = self.is_dna self.config.is_prot = self.is_protein self.get_metadata(self.df,columns_to_exclude=list(seq_columns.values())) - def create_seq_db(self,stype,col_name,outfile,blast_method='nucl'): - self.init_dir(outfile) - f = os.path.join(outfile,"{}.fasta".format(stype)) - self.write_seq_file(f,col_name) - (stdout,stderr) = self.makeblastdb(fasta_file=f,outfile=os.path.join(outfile,stype),blast_method=blast_method) - self.messages.append(stderr) - - def init_dir(self,d): - if os.path.isdir(d) and not self.force: - self.messages.append(f'Error {d} exists, if you would like to overwrite, then specify --force') - return False - - if not os.path.isdir(d): - os.makedirs(d, 0o755) - + def init_dir(self, d: Path): + logger.info("Checking if directory {} exists.".format(d)) + try: + d.mkdir(parents=True, exist_ok=self.force, mode=0o755) + except FileExistsError: + logger.critical("Database file {} already exists. To overwrite please run with --force".format(d)) + sys.exit(17) return True + def read_data(self,f): if os.path.isfile(f) and os.path.getsize(f) > 0: @@ -99,21 +105,6 @@ def write_seq_file(self,f,col_name): oh.write(f">{id}\n{data[id]}\n") oh.close() - def makeblastdb(self,fasta_file,outfile,blast_method): - dbtype = 'nucl' - if blast_method == 'prot': - dbtype = 'prot' - command = ['makeblastdb', - '-in', fasta_file, - '-dbtype', dbtype] - if self.parse_seqids: - command.append('-parse_seqids') - if outfile != None: - command+= ['-out',outfile] - command = " ".join([str(x) for x in command]) - - return run_command(command) - def get_metadata(self,df,columns_to_exclude=['dna_seq','aa_seq']): columns_to_exclude = set(columns_to_exclude) columns_to_include = [] @@ -125,7 +116,7 @@ def get_metadata(self,df,columns_to_exclude=['dna_seq','aa_seq']): "info": { "num_seqs": len(df), "is_cds": "True", - "trans_table": 11, + "trans_table": self.translation_table, "dna_min_len": min(df['dna_min_len'].tolist()), "dna_max_len": max(df['dna_min_len'].tolist()), "dna_min_ident": min(df['dna_min_ident'].tolist()), @@ -137,18 +128,20 @@ def get_metadata(self,df,columns_to_exclude=['dna_seq','aa_seq']): } def add_args(parser=None): - + translation_table_def = 11 + default_version = "1.0.0" if parser is None: parser = ArgumentParser( description="Locidex: Build a locidex database",) parser.add_argument('-i','--input_file', type=str, required=True,help='Input tsv formated for locidex') parser.add_argument('-o', '--outdir', type=str, required=True, help='Output directory to put results') - parser.add_argument('-n', '--name', type=str, required=False, help='DB name',default='Locidex Database') - parser.add_argument('-a', '--author', type=str, required=False, help='Author Name for Locidex Database',default='') - parser.add_argument('-c', '--db_ver', type=str, required=False, help='Version code for locidex db', - default='1.0.0') + parser.add_argument('-n', '--name', type=str, required=True, help='Database name',default='Locidex Database') + parser.add_argument('-a', '--author', type=str, required=False, help='Author Name for Locidex Database. Default: {}'.format(getpass.getuser()),default=getpass.getuser()) + parser.add_argument('-c', '--db_ver', type=str, required=False, help='Version code for locidex db: {}'.format(default_version), + default=default_version) parser.add_argument('-e', '--db_desc',type=str, required=False, help='Version code for locidex db', default='') + parser.add_argument("-t", "--translation_table", type=int, required=False, help="Translation table to use: {}".format(translation_table_def), default=translation_table_def) parser.add_argument('-V', '--version', action='version', version="%(prog)s " + __version__) parser.add_argument('-f', '--force', required=False, help='Overwrite existing directory', action='store_true') @@ -161,7 +154,7 @@ def run(cmd_args=None): if cmd_args is None: parser = add_args() cmd_args = parser.parse_args() - #cmd_args = parse_args() + input_file = cmd_args.input_file outdir = cmd_args.outdir force = cmd_args.force @@ -181,7 +174,7 @@ def run(cmd_args=None): print(f'Error {input_file} does not exist, please check path and try again') sys.exit() - obj = locidex_build(input_file, outdir,config=config,seq_columns={'nucleotide':'dna_seq','protein':'aa_seq'},force=force) + obj = locidex_build(Path(input_file), Path(outdir),config=config,seq_columns={'nucleotide':'dna_seq','protein':'aa_seq'},force=force) if obj.status == False: print(f'Error something went wrong building the db, check error messages {obj.messages}') diff --git a/locidex/classes/blast.py b/locidex/classes/blast.py index 3b01f3d..c08c926 100644 --- a/locidex/classes/blast.py +++ b/locidex/classes/blast.py @@ -1,11 +1,21 @@ +""" +Blast module refactored +""" + import pandas as pd from locidex.classes import run_command from locidex.utils import slots +from locidex.manifest import DBData +from locidex.constants import BlastCommands from dataclasses import dataclass -from typing import Optional -import os +from pathlib import Path +from typing import Optional, List, Dict +import logging +import sys +logger = logging.getLogger(__name__) +logging.basicConfig(filemode=sys.stderr, level=logging.DEBUG) @dataclass class FilterOptions: @@ -14,128 +24,161 @@ class FilterOptions: include: Optional[bool] __slots__ = slots(__annotations__) -class blast_search: - VALID_BLAST_METHODS = ['blastn','tblastn','blastp'] - def __init__(self,input_db_path,input_query_path,output_results,blast_params,blast_method,blast_columns,output_db_path=None,parse_seqids=False): - self.input_query_path = input_query_path - self.input_db_path = input_db_path - self.output_db_path = output_db_path - self.output_results = output_results - self.blast_method = blast_method - self.blast_params = blast_params - #self.create_db = create_db - self.BLAST_TABLE_COLS = blast_columns - self.parse_seqids = parse_seqids - - if self.output_db_path is None: - self.output_db_path = input_db_path - - if not os.path.isfile(self.input_query_path): - raise ValueError(f'Error {self.input_query_path} query fasta does not exist') - elif not self.is_blast_db_valid(): - raise ValueError(f'Error {self.input_db_path} is not a valid blast db') - elif not blast_method in self.VALID_BLAST_METHODS: - raise ValueError(f'Error {blast_method} is not a supported blast method: {self.blast_method}') - - #elif create_db and not self.is_blast_db_valid(): - # (stdout,stderr) = self.makeblastdb() - # print(stdout, stderr) - # if not self.is_blast_db_valid(): - # raise ValueError(f'Error {self.output_db_path} is not a valid blast db and creation of db failed') - # TODO add logging for blast messages - (stdout, stderr) = self.run_blast() - print(stdout, stderr) +class BlastMakeDB: + """ + Create a blast database + """ + def __init__(self, input_file: Path, db_type: str, parse_seqids: bool, output_db_path: Optional[Path]): + self.input_file = input_file + self.db_type = db_type + self.parse_seqids = parse_seqids + self.output_db_path = output_db_path def makeblastdb(self): - dbtype = 'nucl' - if self.blast_method == 'blastp': - dbtype = 'prot' - command = ['makeblastdb', - '-in', self.input_db_path, - '-dbtype', dbtype] + """ + Create a blast datbase output + """ + command = ['makeblastdb', '-in', str(self.input_file), '-dbtype', self.db_type] if self.parse_seqids: command.append('-parse_seqids') if self.output_db_path != None: - command +=['-out', self.output_db_path] - return run_command(" ".join([str(x) for x in command])) - - def is_blast_db_valid(self): - extensions = ['nsq', 'nin', 'nhr'] - for e in extensions: - if not os.path.isfile(f'{self.output_db_path}.{e}'): - extensions2 = ['pto', 'ptf', 'phr'] - for e2 in extensions2: - if not os.path.isfile(f'{self.output_db_path}.{e2}'): - return False - - return True - - def run_blast(self): - command = [ - self.blast_method, - '-query', self.input_query_path, - '-db', self.input_db_path, - '-out', self.output_results, - '-outfmt', "'6 {}'".format(' '.join(self.BLAST_TABLE_COLS)), - ] - for p in self.blast_params: - if p == 'parse_seqids': - command.append(f'-{p}') - else: - command += [f'-{p}', f'{self.blast_params[p]}'] - return run_command(" ".join([str(x) for x in command])) - - -class parse_blast: - input_file = None - columns = [] - filter_options = {} - - def __init__(self, input_file,blast_columns,filter_options): - self.input_file = input_file - self.filter_options = filter_options - self.BLAST_TABLE_COLS = blast_columns - - if not os.path.isfile(self.input_file): - raise FileNotFoundError(f'Error {self.input_file} does not exist') + command +=['-out', str(self.output_db_path)] + mk_db_cmd = " ".join([str(x) for x in command]) + logger.info("Blast database command: {}".format(mk_db_cmd)) + stdout, stderr = run_command(mk_db_cmd) + if stdout: + logger.debug("Blast makedb stdout: {}".format(stdout)) + if stderr: + logger.debug("Blast makedb stderr: {}".format(stderr)) - self.df = self.read_hit_table() - print(self.df) - self.columns = self.df.columns.tolist() + self.checkblastdb(self.output_db_path) + return self.output_db_path + + def checkblastdb(self, blast_db_path): + """ + Collect log information about blast database generated. + """ + command = " ".join(["blastdbcheck", '-dbtype', str(self.db_type), '-db', str(blast_db_path)]) + logger.info("Checking generated blast database with command: {}".format(command)) + stdout, stderr = run_command(command) + if stdout: + logger.debug("Check Blast DB commands: {}".format(stdout)) + if stderr: + logger.debug("Check Blast DB stderr: {}".format(stderr)) + + +class BlastSearch: + __blast_commands = set(BlastCommands._keys()) + __blast_extensions_nt = frozenset(['.nsq', '.nin', '.nhr']) + __blast_extensions_pt = frozenset(['.pto', '.ptf', '.phr']) + __filter_columns = ["qseqid", "sseqid"] + + def __init__(self, db_data: DBData, query_path: Path, blast_params: dict, blast_method: str, blast_columns: List[str], filter_options: Optional[Dict[str, FilterOptions]]=None): + self.db_data = db_data + if blast_method not in self.__blast_commands: + raise ValueError("{} is not a valid blast command please pick from {}".format(blast_method, self.__blast_commands)) + self.query_path = query_path + self.blast_params = blast_params + self.blast_method = blast_method + self.blast_columns = blast_columns + self.filter_options = filter_options - for id_col in ['qseqid','sseqid']: + def get_blast_data(self, db_path: Path, output: Path) -> pd.DataFrame: + """ + Run blast and parse results + TODO need to clean up the db_path hand off from the DBData obj its dirty + """ + + stdout, stderr = self._run_blast(db=db_path, output=output) + if stdout: + logger.debug("Blast search stdout: {}".format(stdout)) + if stderr: + logger.debug("Blast search stderr: {}".format(stderr)) + + blast_data = self.parse_blast(output_file=output) + return blast_data + + def parse_blast(self, output_file: Path): + """ + Parse a blast output file + output_file Path: Generate blast data + """ + df = self.read_hit_table(output_file) + columns = df.columns.tolist() + for id_col in self.__filter_columns: tp = {} - if id_col in self.columns: + if id_col in columns: tp[id_col] = 'object' - self.df = self.df.astype(tp) + df = df.astype(tp) + if self.filter_options is None: + return df + for col_name in self.filter_options: - if col_name in self.columns: - #min_value = self.filter_options[col_name]['min'] - #max_value = self.filter_options[col_name]['max'] - #include = self.filter_options[col_name]['include'] - min_value = self.filter_options[col_name].min - max_value = self.filter_options[col_name].max - include = self.filter_options[col_name].include - self.filter_df(col_name, min_value, max_value,include) - - def read_hit_table(self): - return pd.read_csv(self.input_file,header=None,names=self.BLAST_TABLE_COLS,sep="\t",low_memory=False) - - - def filter_df(self,col_name,min_value,max_value,include): - if col_name not in self.columns: + if col_name in columns: + min_value = self.filter_options[col_name].min + max_value = self.filter_options[col_name].max + include = self.filter_options[col_name].include + df = self.filter_df(df,col_name, min_value, max_value, include, columns) + return df + + + def filter_df(self,df, col_name,min_value,max_value,include, columns): + if col_name not in columns: return False if min_value is not None: - self.df = self.df[self.df[col_name] >= min_value] + df = df[df[col_name] >= min_value] if max_value is not None: - self.df = self.df[self.df[col_name] <= max_value] + df = df[df[col_name] <= max_value] if include is not None: - self.df = self.df[self.df[col_name].isin(include)] - return True - - + df = df[df[col_name].isin(include)] + return df + + def read_hit_table(self, blast_data): + return pd.read_csv(blast_data,header=None,names=self.blast_columns,sep="\t",low_memory=False) + + + def _check_blast_files(self, db_dir: Path, extensions: frozenset): + """ + """ + extensions_ = set([i.suffix for i in db_dir.iterdir()]) + if not extensions_.issuperset(extensions): + raise ValueError("Missing required blast files. {}".format([i for i in extensions_ if i not in extensions])) + + def validate_blast_db(self, db_data=None): + """ + """ + if db_data is None: + db_data = self.db_data + if db_data.nucleotide: + self._check_blast_files(db_data.nucleotide, self.__blast_extensions_nt) + + if db_data.protein: + self._check_blast_files(db_data.protein, self.__blast_extensions_pt) + + def _run_blast(self, db: Path, output: Path): + """ + db PAth: Path to the blast database to use, + output Path: Path to file for blast output + + TODO need to use classes db versoin or not pass it too the initializer + """ + command = [ + self.blast_method, + '-query', self.query_path, + '-db', str(db), + '-out', str(output), + '-outfmt', "'6 {}'".format(' '.join(self.blast_columns)), + ] + for param in self.blast_params: + if param == "parse_seqids": + command.append(f"-{param}") + else: + command += [f'-{param}', f'{self.blast_params[param]}'] + blast_search_cmd = " ".join([str(x) for x in command]) + logger.info("Blast command: {}".format(blast_search_cmd)) + return run_command(blast_search_cmd) diff --git a/locidex/classes/blast2.py b/locidex/classes/blast2.py deleted file mode 100644 index 1344b31..0000000 --- a/locidex/classes/blast2.py +++ /dev/null @@ -1,167 +0,0 @@ -""" -Blast module refactored -""" - -import pandas as pd - -from locidex.classes import run_command -from locidex.utils import slots -from locidex.manifest import DBData -from locidex.constants import BlastCommands -from dataclasses import dataclass -from pathlib import Path -from typing import Optional, List, Dict -import logging -import sys - -logger = logging.getLogger(__name__) -logging.basicConfig(filemode=sys.stderr, level=logging.INFO) - -@dataclass -class FilterOptions: - min: Optional[float] - max: Optional[float] - include: Optional[bool] - __slots__ = slots(__annotations__) - - -class BlastMakeDB: - """ - Create a blast database - """ - - def __init__(self, input_file: Path, db_type: str, parse_seqids: bool, output_db_path: Optional[Path]): - self.input_file = input_file - self.db_type = db_type - self.parse_seqids = parse_seqids - self.output_db_path = output_db_path - - def makeblastdb(self): - command = ['makeblastdb', '-in', str(self.input_file), '-dbtype', self.db_type] - if self.parse_seqids: - command.append('-parse_seqids') - if self.output_db_path != None: - command +=['-out', str(self.output_db_path)] - mk_db_cmd = " ".join([str(x) for x in command]) - logger.info("Blast database command: {}".format(mk_db_cmd)) - stdout, stderr = run_command(mk_db_cmd) - if stdout: - logger.info("Blast makedb stdout: {}".format(stdout)) - if stderr: - logger.info("Blast makedb stderr: {}".format(stderr)) - return self.output_db_path - -class BlastSearch: - __blast_commands = set(BlastCommands._keys()) - __blast_extensions_nt = frozenset(['.nsq', '.nin', '.nhr']) - __blast_extensions_pt = frozenset(['.pto', '.ptf', '.phr']) - __filter_columns = ["qseqid", "sseqid"] - - def __init__(self, db_data: DBData, query_path: Path, blast_params: dict, blast_method: str, blast_columns: List[str], filter_options: Optional[Dict[str, FilterOptions]]=None): - self.db_data = db_data - if blast_method not in self.__blast_commands: - raise ValueError("{} is not a valid blast command please pick from {}".format(blast_method, self.__blast_commands)) - self.query_path = query_path - self.blast_params = blast_params - self.blast_method = blast_method - self.blast_columns = blast_columns - self.filter_options = filter_options - - def get_blast_data(self, db_path: Path, output: Path) -> pd.DataFrame: - """ - Run blast and parse results - TODO need to clean up the db_path hand off from the DBData obj its dirty - """ - - stdout, stderr = self._run_blast(db=db_path, output=output) - if stdout: - logger.info("Blast search stdout: {}".format(stdout)) - if stderr: - logger.info("Blast search stderr: {}".format(stderr)) - - blast_data = self.parse_blast(output_file=output) - return blast_data - - def parse_blast(self, output_file: Path): - """ - Parse a blast output file - output_file Path: Generate blast data - """ - df = self.read_hit_table(output_file) - columns = df.columns.tolist() - for id_col in self.__filter_columns: - tp = {} - if id_col in columns: - tp[id_col] = 'object' - df = df.astype(tp) - if self.filter_options is None: - return df - - for col_name in self.filter_options: - if col_name in columns: - min_value = self.filter_options[col_name].min - max_value = self.filter_options[col_name].max - include = self.filter_options[col_name].include - df = self.filter_df(df,col_name, min_value, max_value, include, columns) - return df - - - def filter_df(self,df, col_name,min_value,max_value,include, columns): - if col_name not in columns: - return False - if min_value is not None: - df = df[df[col_name] >= min_value] - if max_value is not None: - df = df[df[col_name] <= max_value] - if include is not None: - df = df[df[col_name].isin(include)] - return df - - def read_hit_table(self, blast_data): - return pd.read_csv(blast_data,header=None,names=self.blast_columns,sep="\t",low_memory=False) - - - def _check_blast_files(self, db_dir: Path, extensions: frozenset): - """ - """ - extensions_ = set([i.suffix for i in db_dir.iterdir()]) - if not extensions_.issuperset(extensions): - raise ValueError("Missing required blast files. {}".format([i for i in extensions_ if i not in extensions])) - - def validate_blast_db(self, db_data=None): - """ - """ - if db_data is None: - db_data = self.db_data - if db_data.nucleotide: - self._check_blast_files(db_data.nucleotide, self.__blast_extensions_nt) - - if db_data.protein: - self._check_blast_files(db_data.protein, self.__blast_extensions_pt) - - def _run_blast(self, db: Path, output: Path): - """ - db PAth: Path to the blast database to use, - output Path: Path to file for blast output - - TODO need to use classes db versoin or not pass it too the initializer - """ - command = [ - self.blast_method, - '-query', self.query_path, - '-db', str(db), - '-out', str(output), - '-outfmt', "'6 {}'".format(' '.join(self.blast_columns)), - ] - for param in self.blast_params: - if param == "parse_seqids": - command.append(f"-{param}") - else: - command += [f'-{param}', f'{self.blast_params[param]}'] - blast_search_cmd = " ".join([str(x) for x in command]) - logger.info("Blast command: {}".format(blast_search_cmd)) - return run_command(blast_search_cmd) - - - - diff --git a/locidex/extract.py b/locidex/extract.py index ef3e059..f830ee4 100644 --- a/locidex/extract.py +++ b/locidex/extract.py @@ -9,8 +9,8 @@ import numpy as np import pandas as pd from locidex.classes.extractor import extractor -#from locidex.classes.blast import blast_search, parse_blast -from locidex.classes.blast2 import BlastSearch, FilterOptions, BlastMakeDB + +from locidex.classes.blast import BlastSearch, FilterOptions, BlastMakeDB from locidex.manifest import DBData from locidex.classes.db import search_db_conf, db_config from locidex.classes.seq_intake import seq_intake, seq_store @@ -156,6 +156,7 @@ def run_extract(config): seq_data[str(idx)] = {'id':str(seq.seq_id),'seq':seq.dna_seq} oh.write(">{}\n{}\n".format(idx,seq.dna_seq)) del(seq_obj) + #TODO this should probably work on more than just nucleotides contigs_db = BlastMakeDB(contigs_path, DBData.nucleotide_db_type(), True, contigs_path) contigs_db.makeblastdb() diff --git a/locidex/search.py b/locidex/search.py index 933df56..2b4bd0b 100644 --- a/locidex/search.py +++ b/locidex/search.py @@ -10,8 +10,7 @@ import pandas as pd from functools import partial -#from locidex.classes.blast import blast_search, parse_blast, FilterOptions -from locidex.classes.blast2 import BlastSearch, FilterOptions +from locidex.classes.blast import BlastSearch, FilterOptions from locidex.classes.db import search_db_conf, db_config from locidex.manifest import DBData from locidex.classes.seq_intake import seq_intake, seq_store, HitFilters diff --git a/tests/test_blast.py b/tests/test_blast.py index 4ae8998..59ccd1d 100644 --- a/tests/test_blast.py +++ b/tests/test_blast.py @@ -1,60 +1,40 @@ -import pytest, os -import pandas as pd -import locidex.classes.blast -import shutil -from locidex.classes.blast import FilterOptions -from locidex.constants import BlastColumns - +import pytest +import os +import locidex +from pathlib import Path +from locidex.manifest import DBData +from locidex.classes import blast +from locidex.constants import BlastCommands, BlastColumns PACKAGE_ROOT = os.path.dirname(locidex.__file__) + +@pytest.fixture() +def db_data(): + db_dir = DBData(Path(PACKAGE_ROOT).joinpath("example", "build_db_mlst_out")) + return db_dir + @pytest.fixture() -def blast_search_class_init(tmpdir): - test_dir = tmpdir - blast_search_obj = locidex.classes.blast.blast_search(input_db_path=os.path.join(PACKAGE_ROOT, "example/build_db_mlst_out/blast/nucleotide/nucleotide"), - input_query_path=os.path.join(PACKAGE_ROOT, 'example/search/NC_003198.1.fasta'), - output_results=os.path.join(test_dir,"hsps.txt"), - blast_params={'evalue': 0.0001,'max_target_seqs': 10,'num_threads': 1}, - blast_method='blastn', - blast_columns=BlastColumns._fields) - #blast_search_obj.run_blast() - return test_dir, blast_search_obj - - -#def test_make_mlst_database(blast_search_class_init): -# test_dir, obj = blast_search_class_init -# #print(test_dir) -# #blast_search_obj.input_db_path = os.path.join(PACKAGE_ROOT, "example/build_db_mlst_out/blast/nucleotide/nucleotide.fasta") -# #blast_search_obj.input_query_path = os.path.join(PACKAGE_ROOT, 'example/search/NC_003198.1.fasta') -# #blast_search_obj.output_db_path = os.path.join(tmpdir,"nucleotide_mlst_database") -# #blast_search_obj.makeblastdb(); -# -# assert len([file for file in os.listdir(test_dir) if "nucleotide_mlst_database" in file]) > 0 -# -# #blast_search_obj.input_db_path = os.path.join(test_dir, "nucleotide_mlst_database") # assign new path to freshly created database to check its validity -# #assert blast_search_obj.is_blast_db_valid() == True - - - -def test_run_blast_on_genome_and_check_output(blast_search_class_init): - #test_make_mlst_database(blast_search_class_init,tmpdir) - blast_search_obj, obj = blast_search_class_init - #blast_search_obj.input_db_path = os.path.join(tmpdir, "nucleotide_mlst_database") - #obj.run_blast() - #output_blast_results_path = os.path.join(tmpdir,"hsps.txt") - output_blast_results_path = os.path.join(blast_search_obj, "hsps.txt") - assert os.path.exists(output_blast_results_path) == True - print(output_blast_results_path) - - with open(output_blast_results_path, "r") as fp: - output_blast_results_file = fp.readlines() - assert len(output_blast_results_file) == 10 - print(output_blast_results_path) - parse_blast_obj = locidex.classes.blast.parse_blast(input_file = output_blast_results_path, - blast_columns = BlastColumns._fields, - filter_options={'bitscore': FilterOptions(min=600, max=None, include=None)}) - assert parse_blast_obj.df.shape[0] == 7 - assert parse_blast_obj.df['bitscore'].max() == 926 - assert len([item for item in parse_blast_obj.df.columns.to_list() if item not in BlastColumns._fields]) == 0 #check if columns in df and constant are identical +def fasta(): + return Path(PACKAGE_ROOT).joinpath("example", "search", "NC_003198.1.fasta") + +def test_validate_blast_db(db_data): + + test_class = blast.BlastSearch(db_data, Path("home"), dict(), BlastCommands.blastn, BlastColumns._fields, dict()) + test_class.validate_blast_db() + + +def test_blast_runs(db_data, fasta, tmpdir): + test_class = blast.BlastSearch(db_data, fasta, dict(), BlastCommands.blastn, BlastColumns._fields, dict()) + out_file = "out.txt" + stdout, stderr = test_class._run_blast(db_data.nucleotide_blast_db, tmpdir / out_file) + with open(tmpdir / out_file, "r") as fp: + assert len(fp.readlines()) == 30 + +def test_blast_runs(db_data, fasta, tmpdir): + test_class = blast.BlastSearch(db_data, fasta, dict(), BlastCommands.blastn, BlastColumns._fields, dict()) + out_file = tmpdir / "out.txt" + bd = test_class.get_blast_data(db_data.nucleotide_blast_db, out_file) + assert len(bd) == 30 \ No newline at end of file diff --git a/tests/test_blast2.py b/tests/test_blast2.py deleted file mode 100644 index 4663c82..0000000 --- a/tests/test_blast2.py +++ /dev/null @@ -1,40 +0,0 @@ -import pytest -import os -import locidex -from pathlib import Path -from locidex.manifest import DBData -from locidex.classes import blast2 -from locidex.constants import BlastCommands, BlastColumns - - -PACKAGE_ROOT = os.path.dirname(locidex.__file__) - - -@pytest.fixture() -def db_data(): - db_dir = DBData(Path(PACKAGE_ROOT).joinpath("example", "build_db_mlst_out")) - return db_dir - -@pytest.fixture() -def fasta(): - return Path(PACKAGE_ROOT).joinpath("example", "search", "NC_003198.1.fasta") - -def test_validate_blast_db(db_data): - - test_class = blast2.BlastSearch(db_data, Path("home"), dict(), BlastCommands.blastn, BlastColumns._fields, dict()) - test_class.validate_blast_db() - - -def test_blast_runs(db_data, fasta, tmpdir): - test_class = blast2.BlastSearch(db_data, fasta, dict(), BlastCommands.blastn, BlastColumns._fields, dict()) - out_file = "out.txt" - stdout, stderr = test_class._run_blast(db_data.nucleotide_blast_db, tmpdir / out_file) - with open(tmpdir / out_file, "r") as fp: - assert len(fp.readlines()) == 30 - - -def test_blast_runs(db_data, fasta, tmpdir): - test_class = blast2.BlastSearch(db_data, fasta, dict(), BlastCommands.blastn, BlastColumns._fields, dict()) - out_file = tmpdir / "out.txt" - bd = test_class.get_blast_data(db_data.nucleotide_blast_db, out_file) - assert len(bd) == 30 \ No newline at end of file diff --git a/tests/test_extractor.py b/tests/test_extractor.py index 9b5afbd..3d74f9b 100644 --- a/tests/test_extractor.py +++ b/tests/test_extractor.py @@ -1,8 +1,8 @@ import pytest import os import locidex -from locidex.classes.blast import blast_search, parse_blast -from locidex.classes.blast2 import FilterOptions, BlastSearch, BlastMakeDB +#from locidex.classes.blast import blast_search, parse_blast +from locidex.classes.blast import FilterOptions, BlastSearch, BlastMakeDB from locidex.constants import BlastColumns, BlastCommands from locidex.classes.extractor import extractor from locidex.classes.seq_intake import seq_intake @@ -17,17 +17,34 @@ def blast_db_and_search(tmpdir,input_db_path): - blast_search_obj = blast_search(input_db_path=input_db_path, - input_query_path=os.path.join(PACKAGE_ROOT, 'example/build_db_mlst_out/blast/nucleotide/nucleotide.fasta'), - output_results=os.path.join(tmpdir,"hsps.txt"), blast_params={'evalue': 0.0001,'max_target_seqs': 10,'num_threads': 1}, - blast_method='blastn', - blast_columns=BlastColumns._fields,create_db=True) - blast_search_obj.run_blast() - output_blast_results_path = os.path.join(tmpdir,"hsps.txt") - parse_blast_obj = parse_blast(input_file = output_blast_results_path, - blast_columns = BlastColumns._fields, - filter_options={'bitscore':{'min':600, 'max':None, 'include':None}}) - return parse_blast_obj + #blast_search_obj = blast_search(input_db_path=input_db_path, + # input_query_path=os.path.join(PACKAGE_ROOT, 'example/build_db_mlst_out/blast/nucleotide/nucleotide.fasta'), + # output_results=os.path.join(tmpdir,"hsps.txt"), blast_params={'evalue': 0.0001,'max_target_seqs': 10,'num_threads': 1}, + # blast_method='blastn', + # blast_columns=BlastColumns._fields,create_db=True) + #blast_search_obj.run_blast() + #output_blast_results_path = os.path.join(tmpdir,"hsps.txt") + #parse_blast_obj = parse_blast(input_file = output_blast_results_path, + # blast_columns = BlastColumns._fields, + # filter_options={'bitscore':{'min':600, 'max':None, 'include':None}}) + + blast_search_obj = BlastMakeDB( + db_data=Path(input_db_path), + db_type=DBData.nucleotide_db_type(), + parse_seqids=True, + output_db_path=Path(input_db_path)) + blast_db_path = blast_search_obj.makeblastdb() + + blast_search_obj = BlastSearch(db_data=blast_db_path, + query_path=Path(os.path.join(PACKAGE_ROOT, 'example/build_db_mlst_out/blast/nucleotide/nucleotide.fasta')), + blast_params={'evalue': 0.0001,'max_target_seqs': 10,'num_threads': 1}, + blast_method=BlastCommands.blastn, + blast_columns=BlastColumns._fields, + filter_options={'bitscore':FilterOptions(**{'min':600, 'max':None, 'include':None})}) + + parse_blast_obj = blast_search_obj.get_blast_data(db_path=blast_db_path, output=tmpdir.joinpath("hsps.txt")) + + #return parse_blast_obj @pytest.fixture def seq_intake_fixture(): From a6b7b3c8090577b839bad0ca35f6b4b04b6b1aeb Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Fri, 10 May 2024 16:19:33 -0500 Subject: [PATCH 2/4] Created metadata info class --- locidex/build.py | 33 ++++++++++++--------------------- locidex/constants.py | 16 ++++++++++++++++ locidex/extract.py | 2 +- 3 files changed, 29 insertions(+), 22 deletions(-) diff --git a/locidex/build.py b/locidex/build.py index 7955fa1..0638d18 100644 --- a/locidex/build.py +++ b/locidex/build.py @@ -7,7 +7,7 @@ from locidex.version import __version__ from locidex.constants import DBFiles from locidex.classes import run_command -from locidex.constants import DBConfig +from locidex.constants import DBConfig, MetadataFields from locidex.classes.blast import BlastMakeDB from locidex.manifest import DBData import getpass @@ -19,11 +19,6 @@ logging.basicConfig(filemode=sys.stderr, level=logging.INFO) class locidex_build: - input_file = None - config = {} - meta = {} - num_seqs = 0 - def __init__(self, input_file: os.PathLike, outdir: os.PathLike, config: DBConfig, seq_columns={'nucleotide':'dna_seq','protein':'aa_seq'},force=False,parse_seqids=False,translation_table: int = 11): @@ -37,10 +32,8 @@ def __init__(self, input_file: os.PathLike, outdir: os.PathLike, config: DBConfi self.is_dna = False self.is_protein = False self.blast_dir = outdir.joinpath("blast") - #self.status = self.blast_dir = os.path.join(outdir,'blast') + self.init_dir(self.blast_dir) - if not self.status: - return self.df = self.read_data(self.input_file) self.config = config @@ -50,7 +43,6 @@ def __init__(self, input_file: os.PathLike, outdir: os.PathLike, config: DBConfi col_name = seq_columns[t] s = self.is_seqtype_present(col_name) if s: - #outfile = os.path.join(self.blast_dir, t) outfile = self.blast_dir.joinpath(t) if t == DBData.nucleotide_name(): self.is_dna = True @@ -113,17 +105,16 @@ def get_metadata(self,df,columns_to_exclude=['dna_seq','aa_seq']): columns_to_include.append(col) subset = df[columns_to_include] self.meta = { - "info": { - "num_seqs": len(df), - "is_cds": "True", - "trans_table": self.translation_table, - "dna_min_len": min(df['dna_min_len'].tolist()), - "dna_max_len": max(df['dna_min_len'].tolist()), - "dna_min_ident": min(df['dna_min_ident'].tolist()), - "aa_min_len": min(df['aa_min_len'].tolist()), - "aa_max_len": max(df['aa_min_len'].tolist()), - "aa_min_ident": min(df['aa_min_ident'].tolist()), - }, + "info": MetadataFields( + num_seqs=len(df), + is_cds=True, + trans_table=self.translation_table, + dna_min_len=min(df['dna_min_len'].tolist()), + dna_max_len=max(df['dna_min_len'].tolist()), + dna_min_ident=min(df['dna_min_ident'].tolist()), + aa_min_len=min(df['aa_min_len'].tolist()), + aa_max_len= max(df['aa_min_len'].tolist()), + aa_min_ident= min(df['aa_min_ident'].tolist())).to_dict(), "meta": subset.to_dict(orient='index') } diff --git a/locidex/constants.py b/locidex/constants.py index 6044e92..41e89ef 100644 --- a/locidex/constants.py +++ b/locidex/constants.py @@ -161,6 +161,22 @@ class ManifestFields: 'protein':'protein.fasta', } +@dataclass +class MetadataFields: + num_seqs: int + is_cds: bool + trans_table: int + dna_min_len: int + dna_max_len: int + dna_min_ident: float + aa_min_len: int + aa_max_len: int + aa_min_ident: float + + def to_dict(self): + return asdict(self) + + class LocidexDBHeader(NamedTuple): seq_id: str diff --git a/locidex/extract.py b/locidex/extract.py index f830ee4..0215412 100644 --- a/locidex/extract.py +++ b/locidex/extract.py @@ -165,7 +165,7 @@ def run_extract(config): if not os.path.isdir(blast_dir_base): os.makedirs(blast_dir_base, 0o755) - blast_database_paths = db_database_config.blast_paths + #blast_database_paths = db_database_config.blast_paths blast_params = { 'evalue': min_evalue, From 79fc1524113c078c54cec9314fbe05a3fed7df11 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Fri, 10 May 2024 16:21:15 -0500 Subject: [PATCH 3/4] updated extract messages --- locidex/extract.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/locidex/extract.py b/locidex/extract.py index 0215412..2edc862 100644 --- a/locidex/extract.py +++ b/locidex/extract.py @@ -61,7 +61,7 @@ def add_args(parser=None): help='Format of query file [genbank,fasta]') parser.add_argument('--translation_table', type=int, required=False, help='output directory', default=11) - parser.add_argument('-a', '--annotate', required=False, help='Perform annotation on unannotated input fasta', + parser.add_argument('-a', '--annotate', required=False, help='Perform annotation on unannotated input fasta (Do not use if you are taking in the output of extract)', action='store_true') parser.add_argument('-V', '--version', action='version', version="%(prog)s " + __version__) parser.add_argument('-f', '--force', required=False, help='Overwrite existing directory', From 99d16f2c346089422c8e009cc2adba2eef9d35b3 Mon Sep 17 00:00:00 2001 From: Matthew Wells Date: Fri, 10 May 2024 16:27:54 -0500 Subject: [PATCH 4/4] Updated string constants --- locidex/build.py | 3 ++- locidex/constants.py | 2 ++ 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/locidex/build.py b/locidex/build.py index 0638d18..df591cf 100644 --- a/locidex/build.py +++ b/locidex/build.py @@ -21,7 +21,8 @@ class locidex_build: - def __init__(self, input_file: os.PathLike, outdir: os.PathLike, config: DBConfig, seq_columns={'nucleotide':'dna_seq','protein':'aa_seq'},force=False,parse_seqids=False,translation_table: int = 11): + def __init__(self, input_file: os.PathLike, outdir: os.PathLike, config: DBConfig, + seq_columns={DBData.nucleotide_name():'dna_seq',DBData.protein_name():'aa_seq'},force=False,parse_seqids=False,translation_table: int = 11): self.input_file = input_file self.translation_table = translation_table self.outdir = outdir diff --git a/locidex/constants.py b/locidex/constants.py index 41e89ef..a3a8ca3 100644 --- a/locidex/constants.py +++ b/locidex/constants.py @@ -22,6 +22,7 @@ class CharacterConstants: stop_codon: str = "*" + #BLAST_TABLE_COLS = ''' #qseqid #sseqid @@ -201,3 +202,4 @@ class LocidexDBHeader(NamedTuple): min_aa_match_cov: Optional[int] count_int_stops: int dna_ambig_count: int +