Skip to content

Commit 95c5489

Browse files
committed
Merge remote-tracking branch 'origin/master' into fix-samplesheets
* origin/master: fix: Compress output file in EvolCCM (#192) fix: Run rSPR when tree has duplicated nodes (#191) refactor: Save cluster tree with cluster prob (#190)
2 parents f1c93b6 + 7142981 commit 95c5489

File tree

5 files changed

+78
-22
lines changed

5 files changed

+78
-22
lines changed

bin/ParallelEvolCCM.R

+4-2
Original file line numberDiff line numberDiff line change
@@ -458,19 +458,21 @@ for (result in results) {
458458
temp_files <- list.files(pattern = "output_temp_chunk_[0-9]+\\.txt")
459459

460460
outputHeadings <- paste("feature1","feature2","intrisic1", "intrisic2", "gainloss1", "gainloss2", "interaction", "interact_score", "interact_pval",sep="\t")
461-
cat(outputHeadings, file = outputFile, sep = "\n")
461+
outputFilegz = gzfile(outputFile, "w")
462+
cat(outputHeadings, file = outputFilegz, sep = "\n")
462463
for (temp_file in temp_files) {
463464
# Read the contents of the temporary file
464465
temp_contents <- readLines(temp_file)
465466

466467
# Write the contents to the main output file
467-
cat(temp_contents, file = outputFile, append = TRUE, sep = "\n")
468+
cat(temp_contents, file = outputFilegz, append = TRUE, sep = "\n")
468469

469470
# Remove the temporary file
470471
file.remove(temp_file)
471472
}
472473

473474
# Read the lines from the input file
475+
close(outputFilegz)
474476
lines <- readLines(outputFile)[-1]
475477
X2file = paste(outputFile,"X2",sep = ".")
476478
Pfile = paste(outputFile,"pvals",sep = ".")

bin/rspr_approx.py

+40-13
Original file line numberDiff line numberDiff line change
@@ -85,12 +85,21 @@ def parse_args(args=None):
8585
)
8686
return parser.parse_args(args)
8787

88+
def check_formatted_tree(tree_string):
89+
"""Check if formatted tree has duplicate nodes"""
90+
91+
pattern = r'([a-zA-Z]+\w{3,}):.*\1'
92+
match = re.search(pattern, tree_string)
93+
94+
return bool(match)
8895

8996
def read_tree(input_path):
9097
with open(input_path, "r") as f:
9198
tree_string = f.read()
9299
formatted = re.sub(r";[^:]+:", ":", tree_string)
93-
return Tree(formatted)
100+
is_duplicated = check_formatted_tree(formatted)
101+
102+
return Tree(formatted), is_duplicated
94103

95104

96105
#####################################################################
@@ -102,12 +111,27 @@ def read_tree(input_path):
102111
#####################################################################
103112

104113

105-
def root_tree(input_path, output_path):
106-
tre = read_tree(input_path)
114+
def root_tree(input_path, basename, output_path):
115+
tre,is_duplicated = read_tree(input_path)
116+
midpoint = tre.get_midpoint_outgroup()
117+
tre.set_outgroup(midpoint)
118+
if is_duplicated:
119+
outdir = Path(output_path) / "multiple"
120+
Path(outdir).mkdir(exist_ok=True, parents=True)
121+
output_path = outdir / basename
122+
output_path = str(output_path).replace(".tre", ".tre.multiple")
123+
else:
124+
outdir = Path(output_path) / "unique"
125+
Path(outdir).mkdir(exist_ok=True, parents=True)
126+
output_path = outdir / basename
127+
128+
tre.write(outfile=output_path)
129+
return tre.write(), len(tre.get_leaves()), output_path, is_duplicated
130+
131+
def root_reference_tree(input_path, output_path):
132+
tre, _ = read_tree(input_path)
107133
midpoint = tre.get_midpoint_outgroup()
108134
tre.set_outgroup(midpoint)
109-
if not os.path.exists(os.path.dirname(output_path)):
110-
os.makedirs(os.path.dirname(output_path))
111135
tre.write(outfile=output_path)
112136
return tre.write(), len(tre.get_leaves())
113137

@@ -135,20 +159,23 @@ def root_trees(core_tree, gene_trees_path, output_dir, results, merge_pair=False
135159
rooted_reference_tree = os.path.join(
136160
output_dir, "rooted_reference_tree/core_gene_alignment.tre"
137161
)
138-
refer_content, refer_tree_size = root_tree(reference_tree, rooted_reference_tree)
162+
refer_content, refer_tree_size = root_reference_tree(reference_tree, rooted_reference_tree)
139163

140164
df_gene_trees = pd.read_csv(gene_trees_path)
141165
rooted_gene_trees_path = os.path.join(output_dir, "rooted_gene_trees")
142166
for filename in df_gene_trees["path"]:
143167
basename = Path(filename).name
144-
rooted_gene_tree_path = os.path.join(rooted_gene_trees_path, basename)
145-
gene_content, gene_tree_size = root_tree(filename, rooted_gene_tree_path)
146-
results.loc[basename, "tree_size"] = gene_tree_size
168+
gene_content, gene_tree_size, gene_tree_path, is_duplicated = root_tree(
169+
filename,
170+
basename,
171+
rooted_gene_trees_path)
172+
if not is_duplicated:
173+
results.loc[basename, "tree_size"] = gene_tree_size
147174
if merge_pair:
148-
with open(rooted_gene_tree_path, "w") as f2:
175+
with open(gene_tree_path, "w") as f2:
149176
f2.write(refer_content + "\n" + gene_content)
150177
#'''
151-
return rooted_gene_trees_path
178+
return os.path.join(rooted_gene_trees_path, "unique")
152179

153180

154181
#####################################################################
@@ -212,7 +239,7 @@ def approx_rspr(
212239
"-length " + str(min_branch_len),
213240
"-support " + str(max_support_threshold),
214241
]
215-
242+
216243
group_size = 10000
217244
cur_count = 0
218245
lst_filename = []
@@ -498,7 +525,7 @@ def main(args=None):
498525
# Generate group heatmap
499526
group_fig_path = os.path.join(args.OUTPUT_DIR, "group_output.png")
500527
make_group_heatmap(
501-
results,
528+
results,
502529
group_fig_path,
503530
args.MIN_HEATMAP_RSPR_DISTANCE,
504531
args.MAX_HEATMAP_RSPR_DISTANCE

bin/rspr_exact.py

+3-3
Original file line numberDiff line numberDiff line change
@@ -88,7 +88,7 @@ def fpt_rspr(results_df, min_branch_len=0, max_support_threshold=0.7, gather_clu
8888
"-support " + str(max_support_threshold),
8989
]
9090

91-
trees_path = os.path.join("rooted_gene_trees")
91+
trees_path = os.path.join("rooted_gene_trees/unique")
9292

9393
cluster_file = None
9494
if gather_cluster_info:
@@ -123,13 +123,13 @@ def fpt_rspr(results_df, min_branch_len=0, max_support_threshold=0.7, gather_clu
123123
continue
124124
elif "Clusters end" in line:
125125
clustering_start = False
126-
126+
127127
if clustering_start:
128128
updated_line = line.replace('(', '').replace(')', '').replace('\n', '')
129129
cluster_nodes = updated_line.split(',')
130130
cluster_nodes = [int(node) for node in cluster_nodes if "X" not in node]
131131
clusters.append(cluster_nodes)
132-
132+
133133
output_lines.append(line)
134134
cluster_file.write(json.dumps(clusters) + '\n')
135135
process.wait()

bin/rspr_heatmap.py

+30-3
Original file line numberDiff line numberDiff line change
@@ -250,6 +250,34 @@ def generate_cluster_network(lst_tree_clusters, refer_tree):
250250
update_cluster_probability(refer_tree.root, dict_clstr_map, total_trees, leaf_mapping)
251251

252252

253+
#####################################################################
254+
### FUNCTION REPLACE_CLUSTER_PROBABILITY
255+
### Replace branch support with cluster probability
256+
### node: current node
257+
#####################################################################
258+
259+
def replace_cluster_probability(node):
260+
if not node:
261+
return
262+
node.comment = node.cluster_probability
263+
if not node.is_terminal():
264+
for child in node.clades:
265+
replace_cluster_probability(child)
266+
267+
268+
#####################################################################
269+
### FUNCTION SAVE_CLUSTER_TREE
270+
### Save cluster network tree
271+
### cluster_file_path: path to store cliuster tree
272+
### refer_tree: reference tree
273+
#####################################################################
274+
275+
def save_cluster_tree(cluster_file_path, refer_tree):
276+
print("Saving cluster tree")
277+
replace_cluster_probability(refer_tree.root)
278+
Phylo.write(refer_tree, cluster_file_path, "newick")
279+
280+
253281
#####################################################################
254282
### FUNCTION GENERATE_CLUSTER_HEATMAP
255283
### Generate cluster heatmap
@@ -301,8 +329,6 @@ def read_tree(input_path):
301329
formatted = re.sub(r";[^:]+:", ":", tree_string)
302330
return Phylo.read(io.StringIO(formatted), "newick")
303331

304-
def write_tree(output_path, data):
305-
Phylo.write(data, output_path, "newick")
306332

307333
def get_fig_size(refer_tree):
308334
max_fig_size = 100
@@ -350,7 +376,6 @@ def main(args=None):
350376
refer_tree = read_tree(refer_tree_path)
351377
if refer_tree:
352378
generate_cluster_network(lst_tree_clusters, refer_tree)
353-
write_tree(cluster_file_path, refer_tree)
354379

355380
plt.rcParams['font.size'] = '12'
356381
fig_size = get_fig_size(refer_tree)
@@ -361,5 +386,7 @@ def main(args=None):
361386
plt.title("Cluster network")
362387
plt.savefig(cluster_tree_path, format="png")
363388

389+
save_cluster_tree(cluster_file_path, refer_tree)
390+
364391
if __name__ == "__main__":
365392
sys.exit(main())

modules/local/evolccm.nf

+1-1
Original file line numberDiff line numberDiff line change
@@ -34,7 +34,7 @@ process EVOLCCM {
3434

3535
stub:
3636
"""
37-
touch EvolCCM_test.tsv
37+
touch EvolCCM_test.tsv.gz
3838
touch EvolCCM_test.tsv.pvals
3939
touch EvolCCM_test.tsv.X2
4040
touch EvolCCM_test.tre

0 commit comments

Comments
 (0)