-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathretrieve_env_aa_seqs.py
56 lines (41 loc) · 1.66 KB
/
retrieve_env_aa_seqs.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
import csv
import re
import sys
from hiv_seq_utils import (read_fasta, safe_fetch_env_aa)
# Create a CSV file with the gp160 amino acid sequences of all the entries in
# the LANL filtered envelope dataset (about 6500 accession numbers) as described
# in https://doi.org/10.1093/jac/dkab257.
# This program has to be run just once. It takes several hours because GenBank
# responds slowly.
# The file does not contain HXB2 because that will be used as the reference
# sequence in Env_align.py
# Of note, many AA sequences are unable to be returned for various reasons
# Further optimization of the program is required to better control what AAs
# are identified.
# Overall, 5862 sequences of lengths 462 to 901 were returned. These included
# 50 sequences between 462 and 809 AAs.
# 750 were missing; 44 were shorter than 450 AAs; These were deleted from the
# the excel file but left in the CSV file.
lanl_filtered_acc = "RefData/LANL_HIV1_FLT_2021_accessions.txt"
acc_numbers = []
with open(lanl_filtered_acc, 'r') as file:
for line in file:
acc_number = line.strip()
acc_numbers.append(acc_number)
# Get the envelope amino acid sequences
gp160_aa_sequences = []
counter=0
for acc in acc_numbers:
counter += 1
if counter % 100 == 0:
print(counter)
(protein_id, aas) = safe_fetch_env_aa(acc)
if protein_id is not None and aas is not None:
gp160_aa_sequences.append(f"{acc},{len(aas)},{aas}\n")
else:
gp160_aa_sequences.append(f"{acc},-,-\n" )
print(acc)
# Write the entries to a text file
full_text = ''.join(gp160_aa_sequences)
with open('gp160_aa_sequences.csv', 'w') as file:
file.write(full_text)