@@ -85,12 +85,21 @@ def parse_args(args=None):
85
85
)
86
86
return parser .parse_args (args )
87
87
88
def check_formatted_tree(tree_string):
    """Return True if a node label occurs more than once in the tree string.

    A label is the complete token that sits directly in front of a ``:``
    and is bounded on the left by the start of the string, whitespace or
    one of the structural characters ``( ) , : ;``.  As in the previous
    pattern, only labels that begin with an ASCII letter and are at least
    four characters long are considered, so numeric tokens such as
    support values or branch lengths are ignored.

    Labels are compared as whole tokens.  The earlier regex
    (``([a-zA-Z]+\\w{3,}):.*\\1``) could start its capture in the middle of
    a label and matched the back-reference as a substring, so pairs like
    ``xgene``/``ygene`` or ``gene1``/``gene10`` were reported as
    duplicates; it also could not see duplicates separated by a newline.

    Args:
        tree_string: Tree text to scan (presumably Newick -- confirm
            against the caller).

    Returns:
        bool: True when at least one qualifying label appears twice.
    """
    label_pattern = r"(?<![^\s(),:;])([a-zA-Z][^\s(),:;]{3,}):"

    seen = set()
    for label in re.findall(label_pattern, tree_string):
        if label in seen:
            return True
        seen.add(label)
    return False
88
95
89
96
def read_tree(input_path):
    """Load a tree file and report whether it contains repeated labels.

    The file content is read as text, every ``;...:`` span (a semicolon,
    one or more non-colon characters, then a colon) is collapsed to a
    single ``:``, and the result is both checked with
    ``check_formatted_tree`` and parsed with ``Tree``.

    Args:
        input_path: Path of the tree file to read.

    Returns:
        tuple: ``(tree, is_duplicated)`` where ``tree`` is the ``Tree``
        built from the cleaned string and ``is_duplicated`` is the result
        of ``check_formatted_tree`` on that same string.
    """
    with open(input_path, "r") as handle:
        raw_text = handle.read()

    # Drop whatever sits between a ';' and the following ':' in each label.
    cleaned = re.sub(r";[^:]+:", ":", raw_text)
    has_duplicates = check_formatted_tree(cleaned)

    return Tree(cleaned), has_duplicates
94
103
95
104
96
105
#####################################################################
@@ -102,12 +111,27 @@ def read_tree(input_path):
102
111
#####################################################################
103
112
104
113
105
def root_tree(input_path, basename, output_path):
    """Midpoint-root a gene tree and write it into a per-category folder.

    The tree is read with ``read_tree``, rooted at the node returned by
    ``get_midpoint_outgroup`` and written to
    ``<output_path>/multiple/<basename with .tre -> .tre.multiple>`` when
    ``read_tree`` flags duplicated labels, or to
    ``<output_path>/unique/<basename>`` otherwise.  The target directory
    is created if it does not exist.

    Args:
        input_path: Path of the tree file to root.
        basename: File name to use for the rooted tree.
        output_path: Base directory that holds the ``unique`` and
            ``multiple`` subdirectories.

    Returns:
        tuple: ``(tree_text, leaf_count, written_path, is_duplicated)``.
        ``tree_text`` is the value of ``tre.write()`` and ``leaf_count``
        is ``len(tre.get_leaves())``.  ``written_path`` is a ``str`` for
        duplicated trees and a ``Path`` for unique ones (kept as before
        so existing callers see the same types).
    """
    tre, is_duplicated = read_tree(input_path)
    midpoint = tre.get_midpoint_outgroup()
    tre.set_outgroup(midpoint)

    outdir = Path(output_path) / ("multiple" if is_duplicated else "unique")
    outdir.mkdir(exist_ok=True, parents=True)

    if is_duplicated:
        # Rename only the file name.  Replacing ".tre" on the full path
        # would also rewrite directory names that happen to contain
        # ".tre" and point the file at a directory that was never created.
        marked_name = str(basename).replace(".tre", ".tre.multiple")
        output_path = str(outdir / marked_name)
    else:
        output_path = outdir / basename

    tre.write(outfile=output_path)
    return tre.write(), len(tre.get_leaves()), output_path, is_duplicated
130
+
131
def root_reference_tree(input_path, output_path):
    """Midpoint-root the reference tree and write it to ``output_path``.

    The tree is read with ``read_tree`` (its duplicate-label flag is
    ignored here), rooted at the node returned by
    ``get_midpoint_outgroup`` and written to ``output_path``.

    Args:
        input_path: Path of the reference tree file.
        output_path: File path for the rooted reference tree.

    Returns:
        tuple: ``(tree_text, leaf_count)`` where ``tree_text`` is the
        value of ``tre.write()`` and ``leaf_count`` is
        ``len(tre.get_leaves())``.
    """
    tre, _ = read_tree(input_path)
    midpoint = tre.get_midpoint_outgroup()
    tre.set_outgroup(midpoint)

    # Make sure the destination folder exists before writing; the caller
    # passes a path inside a subdirectory that may not have been created.
    Path(output_path).parent.mkdir(parents=True, exist_ok=True)

    tre.write(outfile=output_path)
    return tre.write(), len(tre.get_leaves())
113
137
@@ -135,20 +159,23 @@ def root_trees(core_tree, gene_trees_path, output_dir, results, merge_pair=False
135
159
rooted_reference_tree = os .path .join (
136
160
output_dir , "rooted_reference_tree/core_gene_alignment.tre"
137
161
)
138
- refer_content , refer_tree_size = root_tree (reference_tree , rooted_reference_tree )
162
+ refer_content , refer_tree_size = root_reference_tree (reference_tree , rooted_reference_tree )
139
163
140
164
df_gene_trees = pd .read_csv (gene_trees_path )
141
165
rooted_gene_trees_path = os .path .join (output_dir , "rooted_gene_trees" )
142
166
for filename in df_gene_trees ["path" ]:
143
167
basename = Path (filename ).name
144
- rooted_gene_tree_path = os .path .join (rooted_gene_trees_path , basename )
145
- gene_content , gene_tree_size = root_tree (filename , rooted_gene_tree_path )
146
- results .loc [basename , "tree_size" ] = gene_tree_size
168
+ gene_content , gene_tree_size , gene_tree_path , is_duplicated = root_tree (
169
+ filename ,
170
+ basename ,
171
+ rooted_gene_trees_path )
172
+ if not is_duplicated :
173
+ results .loc [basename , "tree_size" ] = gene_tree_size
147
174
if merge_pair :
148
- with open (rooted_gene_tree_path , "w" ) as f2 :
175
+ with open (gene_tree_path , "w" ) as f2 :
149
176
f2 .write (refer_content + "\n " + gene_content )
150
177
#'''
151
- return rooted_gene_trees_path
178
+ return os . path . join ( rooted_gene_trees_path , "unique" )
152
179
153
180
154
181
#####################################################################
@@ -212,7 +239,7 @@ def approx_rspr(
212
239
"-length " + str (min_branch_len ),
213
240
"-support " + str (max_support_threshold ),
214
241
]
215
-
242
+
216
243
group_size = 10000
217
244
cur_count = 0
218
245
lst_filename = []
@@ -498,7 +525,7 @@ def main(args=None):
498
525
# Generate group heatmap
499
526
group_fig_path = os .path .join (args .OUTPUT_DIR , "group_output.png" )
500
527
make_group_heatmap (
501
- results ,
528
+ results ,
502
529
group_fig_path ,
503
530
args .MIN_HEATMAP_RSPR_DISTANCE ,
504
531
args .MAX_HEATMAP_RSPR_DISTANCE
0 commit comments