diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 085078bf8ca9a2d98667e28292967d5823de4621..828b4de38372893b62b056b337a7f89695056f93 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -14,7 +14,7 @@ shallow_run_workflow: ## Downloading apps - mkdir appimgs #- apptainer build apps/pggb.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/pggb:latest - - apptainer build appimgs/snakebox.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/snakebox:latest + - apptainer pull appimgs/snakebox.sif oras://registry.forgemia.inra.fr/alexis.mergez/pan1capps/pan1cbox:latest #- apptainer build apps/pytools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pytools:latest #- apptainer build apps/PanGeTools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pangetools:latest diff --git a/README.md b/README.md index e2f9df24f86057631d2618800ec391ab42a3c2e4..04529fb976ad417e3687aeeb912b73fc1c1548c8 100644 --- a/README.md +++ b/README.md @@ -114,5 +114,6 @@ Pan1c-06AT-v3 This DAG shows the workflow for a pangenome of `Saccharomyces cerevisiae` using the `R64` reference.  - +# Contact +[alexis.mergez@inrae.fr](mailto:alexis.mergez@inrae.fr) diff --git a/Snakefile b/Snakefile index 5c775d64921f519a2a12850c6a2121a93768c39f..71c2bb3b207ac9f8491c82da8a83fe7a1c67fd55 100644 --- a/Snakefile +++ b/Snakefile @@ -1,5 +1,8 @@ configfile: "config.yaml" +include: "rules/tools.smk" + + ## Modules import os import numpy as np @@ -7,15 +10,15 @@ import gzip import re """ - -Main variables used in the workflow - +Main variables used in the workflow --------------------------------------------------------------- """ -# Retrieving the list of haplotypes (SAMPLES) in data/haplotypes + +# Retrieving the list of haplotypes (SAMPLES) in data/haplotypes. +# Only files with the .fa extension will be used
(.fna will not be used) SAMPLES = np.unique([ - os.path.basename(f).split('.fa')[0] - for f in os.listdir("data/haplotypes/") - if re.search(r"\.fa", f) + os.path.basename(file).split('.fa')[0] + for file in os.listdir("data/haplotypes/") + if re.search(r"\.fa", file) ]) # Retrieving the list of haplotypes excluding the reference @@ -32,9 +35,10 @@ nHAP = len(SAMPLES) with gzip.open("data/haplotypes/"+config['reference'], "r") as handle: CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"] -# Adjusting the requested outputs according to the config file +# Adding optional outputs based on config.yaml, using the following function def which_analysis(): - # Creating a list with default analysis steps (to prevent the function from returning an empty list) + + ## Default analysis analysis_inputs = [ "output/stats/pan1c."+config['name']+".core.stats.tsv", # core stats expand("output/panacus.reports/"+config['name']+".{chromosome}.histgrowth.html", chromosome=CHRLIST), # panacus histgrowth @@ -42,30 +46,38 @@ def which_analysis(): "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" # chromosomes graph statistics ] - # Optionals analysis steps + ## Optional analysis steps if config["get_PAV"] == "True": # Adding PAV matrix creation analysis_inputs.append("output/pav.matrices") - #if config["get_allASM_SyRI"] == "True": # Creating the figure for all assemblies - # analysis_inputs.append("output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png") - if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each figures + + if config["get_ASMs_SyRI"] == "True": # Creating SyRI for each input assembly analysis_inputs.append( - expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF), # syri for haplotypes + expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF) ) - if config["run_Quast"] == "True": # Running quast on haplotypes + if config["get_chrInputs_SyRI"] == "True": # Creating SyRI figures for each PGGB input analysis_inputs.append( - "output/quast/Quast.report.html" + expand("output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", chromosome=CHRLIST) ) - if config["get_contig_pos"] == "True": # Contig position in chromosomes figure + if config["run_Quast"] == "True": # Running Quast on input haplotypes + analysis_inputs.append( + "output/quast/"+config['name']+".quast.report.html" + ) + if config["get_contig_pos"] == "True": # Figure of each chromosome's decomposition into contigs analysis_inputs.append( expand("output/chr.contig/{haplotype}.contig.png", haplotype=CHRLIST) ) - return analysis_inputs -""" + if config["create_report"] == "True": # Creating the report (needs the contig figures) + analysis_inputs.append( + "output/pan1c."+config['name']+".report.md" + ) -Rules + return analysis_inputs """ +Rules ------------------------------------------------------------------------------------------- +""" + # Main target rule rule all: input: @@ -74,29 +86,9 @@ rule all: which_analysis() """ - -Tool rules - +Pre-processing section : preparing pggb inputs --------------------------------------------------- """ -rule samtools_index: - # Samtools faidx to index compressed fasta - input: - fa="{sample}.fa.gz" - output: - "{sample}.fa.gz.fai", - "{sample}.fa.gz.gzi" - threads: 1 - params: - apppath=config["app.path"] - shell: - "apptainer run --app samtools {params.apppath}/PanGeTools.sif " - "faidx {input.fa}" -""" - -Pre-processing
section : preparing pggb inputs - -""" rule ragtag_scaffolding: # Scaffold input haplotype against the reference to infer chromosome scale sequences input: @@ -105,26 +97,32 @@ rule ragtag_scaffolding: refgzi="data/haplotypes/"+config['reference']+".gzi", fa="data/haplotypes/{haplotype}.fa.gz" output: - fa="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz", + fa=temp("data/hap.ragtagged/{haplotype}.ragtagged.fa"), tar="data/hap.ragtagged/{haplotype}.tar.gz" threads: 8 retries: 1 + priority: 100 params: - apppath=config["app.path"] + app_path=config["app.path"] + log: + cmd="logs/ragtag/{haplotype}.ragtag.cmd.log", + time="logs/ragtag/{haplotype}.ragtag.time.log" shell: """ - bash scripts/ragtagChromInfer.sh \ + /usr/bin/time -v -o {log.time} \ + bash scripts/ragtagChromInfer.sh \ -d $(dirname {output.fa})/{wildcards.haplotype} \ - -a {params.apppath} \ + -a {params.app_path} \ -t {threads} \ -r {input.ref} \ -q {input.fa} \ - -o {output.fa} + -o {output.fa} 2>&1 | \ + tee {log.cmd} - if [ ! -s {output.fa} ]; then - echo "Empty fasta" - exit 1 - fi + #if [[ -z $(zgrep '[^[:space:]]' {output.fa}) ]] ; then + # echo "Error : Empty final fasta" + # exit 1 + #fi """ rule quast_stats: @@ -133,14 +131,18 @@ rule quast_stats: fas=expand("data/haplotypes/{haplotype}.fa.gz", haplotype=SAMPLES_NOREF), ref="data/haplotypes/"+config['reference'] output: - report="output/quast/Quast.report.html" - threads: workflow.cores//2 + report="output/quast/"+config['name']+".quast.report.html" + threads: 8 params: - apppath=config["app.path"], - panname=config["name"] + app_path=config["app.path"], + pan_name=config["name"] + log: + cmd="logs/quast/quast.cmd.log", + time="logs/quast/quast.time.log" shell: """ - apptainer run {params.apppath}/pan1c-env.sif quast.py \ + /usr/bin/time -v -o {log.time} \ + apptainer run {params.app_path}/pan1c-env.sif quast.py \ -t {threads} \ -o "$(dirname {output.report})" \ -r {input.ref} \ @@ -149,12 +151,13 @@ rule quast_stats: --no-icarus \ -s \ --large \ - {input.fas} + {input.fas} 2>&1 | \ + tee {log.cmd} # Compressing temporary files - apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} $(dirname {output.report})/contigs_reports/*.fa - apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} $(dirname {output.report})/contigs_reports/minimap_output/* mv $(dirname {output.report})/report.html {output.report} @@ -170,7 +173,7 @@ rule contig_position: outdir=directory("output/chr.contig/{chromosome}") threads: 1 params: - apppath=config["app.path"] + app_path=config["app.path"] shell: """ mkdir {output.outdir} @@ -178,14 +181,14 @@ rule contig_position: hapID=$(echo {wildcards.chromosome} | sed 's/.hap/#/') # Creating .bands file - apptainer run {params.apppath}/pan1c-env.sif python scripts/fasta_not_NNN_gff3.py \ + apptainer run {params.app_path}/pan1c-env.sif python scripts/fasta_not_NNN_gff3.py \ -s 5 \ -f {input.fa} \ > $base/{wildcards.chromosome}.gff #cat $base/{wildcards.chromosome}.gff - awk '{{a++; if ((a/2)==int(a/2)){{b="gneg"}}else{{b="gpos75"}}; print $1"\\t"$4"\\t"$5"\\tcontig_"a"\\t"b}}' $base/{wildcards.chromosome}.gff | \ + awk '{{a++; if ((a/2)==int(a/2)){{b="gpos25"}}else{{b="gpos75"}}; print $1"\\t"$4"\\t"$5"\\tcontig_"a"\\t"b}}' $base/{wildcards.chromosome}.gff | \ sed '1ichr\\tstart\\tend\\tname\\tgieStain' \ > $base/{wildcards.chromosome}.bands @@ -202,123 +205,110 @@ rule contig_position: #cat 
$base/{wildcards.chromosome}.tsv # Creating figure - apptainer run --app Renv {params.apppath}/pan1c-env.sif Rscript scripts/contig_position.R \ + apptainer run --app Renv {params.app_path}/pan1c-env.sif Rscript scripts/contig_position.R \ -b $base/{wildcards.chromosome}.bands \ -t $base/{wildcards.chromosome}.tsv \ -o {output.fig} """ -rule clustering: - # Read ragtagged fastas and split chromosome sequences into according bins +rule chromosome_clustering: + # Read ragtagged fastas and split chromosome sequences into the corresponding FASTA files input: expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) output: - expand('data/chrInputs/'+config['name']+'.{chromosome}.fa.gz', chromosome=CHRLIST) - threads: workflow.cores + temp(expand('data/chrInputs/'+config['name']+'.{chromosome}.fa', chromosome=CHRLIST)) + threads: 1 + priority: 100 params: - apppath=config["app.path"], - panname=config["name"] + app_path=config["app.path"], + pan_name=config["name"] shell: """ mkdir -p $(dirname {output[0]}) - apptainer run {params.apppath}/pan1c-env.sif python scripts/inputClustering.py \ - --fasta {input} --output $(dirname {output[0]}) --panname {params.panname} - for file in $(dirname {output[0]})/*.fa; do - apptainer run --app bgzip {params.apppath}/PanGeTools.sif -@ {threads} $file - done + apptainer run {params.app_path}/pan1c-env.sif python scripts/inputClustering.py \ + --fasta {input} --output $(dirname {output[0]}) --panname {params.pan_name} """ -rule create_allAsm_syri_fig: - # Create a SyRI figure containing all input assemblies - input: - ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", - qry=expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES) - output: - fig="output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png", - wrkdir=directory('data/asm.syri/all') - threads: 8 - params: - apppath=config["app.path"] - shell: - """ - mkdir {output.wrkdir} - - bash scripts/getSyriFigs.sh \ - -a {params.apppath} \ - -t {threads} \ - -d {output.wrkdir} \ - -o $(basename {output.fig}) \ - -r {input.ref} \ - -q "{input.qry}" - mv {output.wrkdir}/$(basename {output.fig}) {output.fig} - """ - -rule create_sglAsm_syri_fig: - # Create a SyRI figure for a single input assembly +rule SyRI_on_ASM: + # Run SyRI on a single assembly. + # The assembly is mapped on the 'reference' and SyRI searches for SVs.
input: ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz", qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz" output: fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", wrkdir=directory('data/asm.syri/{haplotype}') + log: + cmd="logs/SyRI_ASM/{haplotype}.SyRI_ASM.cmd.log", + time="logs/SyRI_ASM/{haplotype}.SyRI_ASM.time.log" threads: 4 + retries: 1 resources: mem_gb=48 params: - apppath=config["app.path"] + app_path=config["app.path"] shell: """ mkdir -p {output.wrkdir} - bash scripts/getSyriFigs.sh \ - -a {params.apppath} \ + /usr/bin/time -v -o {log.time} \ + bash scripts/getSyriFigs.sh \ + -a {params.app_path} \ -t {threads} \ -d {output.wrkdir} \ -o $(basename {output.fig}) \ -r {input.ref} \ - -q "{input.qry}" + -q "{input.qry}" 2>&1 | \ + tee {log.cmd} + mv {output.wrkdir}/$(basename {output.fig}) {output.fig} """ """ - Core section : Running PGGB - """ -rule create_pggb_input_syri_fig: +rule SyRI_on_chrInput: + # WIP input: fasta='data/chrInputs/'+config['name']+'.{chromosome}.fa.gz' output: fig="output/chrInput.syri.figs/"+config['name']+".{chromosome}.asm.syri.png", - wrkdir=directory('data/chrInput/syri/{chromosome}') + wrkdir=directory('data/chrInputs/syri/{chromosome}') threads: 8 params: - apppath=config["app.path"], + app_path=config["app.path"], ref=config['reference'] shell: """ mkdir {output.wrkdir} - refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1) + refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1,2) ## Creating single fasta from multifasta zcat {input.fasta} | awk -F"#" \ - '/^>/ {OUT="{output.wrkdir}" substr($0,2) ".fa"}; {print >> OUT; close(OUT)}' - + '/^>/ {{OUT="{output.wrkdir}/" substr($0,2) ".fa"}}; {{print >> OUT; close(OUT)}}' + + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} {output.wrkdir}/*.fa + #zgrep '>' {output.wrkdir}/*.fa.gz + ## Getting the list of sequences - asmList=() - for file in {output.wrkdir}/*.fa; do - asm="$(basename $file .fa | cut -f1 -d"#").fa" + AllAsmList=() + for file in {output.wrkdir}/*.fa.gz; do + asm="$(basename $file .fa.gz | cut -f1,2 -d"#" | sed 's/#/\\.hap/').fa.gz" mv $file "$(dirname $file)/$asm" - asmList+=("$asm") + AllAsmList+=("$(dirname $file)/$asm") done + + #echo "The ASM Array : ${{AllAsmList[@]}}" bash scripts/getSyriFigs.sh \ - -a {params.apppath} \ + -a {params.app_path} \ -t {threads} \ -d {output.wrkdir} \ -o $(basename {output.fig}) \ - -r {output.wrkdir}/"${refname}.fa" \ - -q "${asmList[@]}" + -r {output.wrkdir}/"${{refname}}.fa.gz" \ + -q "${{AllAsmList[*]}}" \ + -h 9 -w 16 mv {output.wrkdir}/$(basename {output.fig}) {output.fig} rm {output.wrkdir}/*.fa """ @@ -332,8 +322,9 @@ rule wfmash_on_chr: mapping=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.mapping.paf"), aln=temp("data/chrGraphs/{chromosome}/{chromosome}.wfmash.aln.paf") threads: 16 + priority: 100 params: - apppath=config['app.path'], + app_path=config['app.path'], segment_length=config['wfmash.segment_length'], mapping_id=config['wfmash.mapping_id'], wfmash_sec=config['wfmash.secondary'] @@ -346,7 +337,7 @@ rule wfmash_on_chr: """ ## Mapping /usr/bin/time -v -o {log.time_map} \ - apptainer exec {params.apppath}/pggb.sif wfmash \ + apptainer exec {params.app_path}/pggb.sif wfmash \ -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ --tmp-base $(dirname {output.aln}) \ @@ -356,7 +347,7 @@ rule wfmash_on_chr: ## Aligning /usr/bin/time -v -o {log.time_aln} \ 
- apptainer exec {params.apppath}/pggb.sif wfmash \ + apptainer exec {params.app_path}/pggb.sif wfmash \ -s {params.segment_length} -l $(( {params.segment_length} * 5 )) -p {params.mapping_id} \ -n $(cat {input.fai} | wc -l) {params.wfmash_sec} -t {threads} \ --tmp-base $(dirname {output.aln}) \ @@ -366,10 +357,10 @@ rule wfmash_on_chr: 2> >(tee {log.cmd_aln} >&2) # Compressing - apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} -k {output.mapping} - apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} -k {output.aln} """ @@ -381,8 +372,9 @@ rule seqwish: output: temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfa") threads: 8 + priority: 100 params: - apppath=config['app.path'], + app_path=config['app.path'], seqwish=config['seqwish.params'] log: cmd="logs/pggb/{chromosome}.seqwish.cmd.log", @@ -390,14 +382,14 @@ rule seqwish: shell: """ /usr/bin/time -v -o {log.time} \ - apptainer exec {params.apppath}/pggb.sif seqwish \ + apptainer exec {params.app_path}/pggb.sif seqwish \ -s {input.fa} -p {input.aln} -g {output} \ {params.seqwish} -t {threads} \ --temp-dir $(dirname {output}) -P 2>&1 | \ tee {log.cmd} # Compressing - apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} -k {output} """ @@ -409,19 +401,20 @@ rule gfaffix_on_chr: gfa=temp("data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.gfa"), transform="data/chrGraphs/{chromosome}/{chromosome}.seqwish.gfaffixD.transform.txt" threads: 1 + priority: 100 params: - apppath=config['app.path'] + app_path=config['app.path'] log: time="logs/pggb/{chromosome}.gfaffix.time.log" shell: """ /usr/bin/time -v -o {log.time} \ - apptainer exec {params.apppath}/pggb.sif gfaffix \ + apptainer exec {params.app_path}/pggb.sif gfaffix \ {input} -o {output.gfa} -t {output.transform} \ > /dev/null # Compressing - apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ -@ {threads} -k {output.gfa} """ @@ -432,8 +425,9 @@ rule odgi_postprocessing: output: gfa='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' threads: 8 + priority: 100 params: - apppath=config['app.path'] + app_path=config['app.path'] log: cmd_build="logs/pggb/{chromosome}.odgi_build.cmd.log", time_build="logs/pggb/{chromosome}.odgi_build.time.log", @@ -449,26 +443,26 @@ rule odgi_postprocessing: ## Converting to odgi format and optimizing namespace /usr/bin/time -v -o {log.time_build} \ - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ build -t {threads} -P -g {input} -o $OGfile.og -O 2>&1 | \ tee {log.cmd_build} ## Unchoping the nodes (merges unitigs into single nodes) /usr/bin/time -v -o {log.time_unchop} \ - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ unchop -P -t {threads} -i $OGfile.og -o $OGfile.unchoped.og 2>&1 | \ tee {log.cmd_unchop} ## Sorting the nodes /usr/bin/time -v -o {log.time_sort} \ - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ sort -P -p Ygs --temp-dir $(dirname $OGfile) -t {threads} \ -i $OGfile.unchoped.og -o $OGfile.unchoped.sorted.og 2>&1 | \ tee {log.cmd_sort} ## Exporting to gfa /usr/bin/time -v -o {log.time_view} \ 
- apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ view -i $OGfile.unchoped.sorted.og -g \ 1> {output.gfa} \ 2> >(tee {log.cmd_view} >&2) @@ -478,12 +472,12 @@ rule odgi_postprocessing: ## Adding metadata python scripts/getTags.py \ - --appdir {params.apppath} --config-file config.yaml \ + --appdir {params.app_path} --config-file config.yaml \ > "$(dirname {input})/metadata.txt" sed -i "/^H/r $(dirname {input})/metadata.txt" {output.gfa} # Compressing - # apptainer run --app bgzip {params.apppath}/PanGeTools.sif \ + # apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ # -@ {threads} -k {output.gfa} """ @@ -494,6 +488,7 @@ rule generate_graph_list: output: "data/chrGraphs/graphsList.txt" threads: 1 + priority: 100 run: with open(output[0], "w") as handle: for file in input: @@ -506,15 +501,23 @@ rule graph_squeeze: graphs=expand('data/chrGraphs/'+config['name']+'.{chromosome}.gfa', chromosome=CHRLIST) output: "output/pan1c."+config['name']+".gfa" + log: + cmd="logs/squeeze/"+config['name']+".squeeze.cmd.log", + time="logs/squeeze/"+config['name']+".squeeze.time.log", threads: 16 + priority: 100 params: - apppath=config['app.path'] + app_path=config['app.path'] shell: """ - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ - squeeze -t {threads} -O -P -f {input.glist} -o {output}.og - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + /usr/bin/time -v -o {log.time} \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + squeeze -t {threads} -O -P -f {input.glist} -o {output}.og 2>&1 | \ + tee {log.cmd} + + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ view -t {threads} -P -i {output}.og -g > {output} + rm {output}.og """ @@ -527,13 +530,13 @@ rule graph_stats: pathstats="output/stats/chrGraphs/"+config['name']+".{chromosome}.path.stats.tsv" threads: 4 params: - apppath=config['app.path'], - panname=config['name'] + app_path=config['app.path'], + pan_name=config['name'] shell: """ - apptainer run --app gfastats {params.apppath}/PanGeTools.sif \ + apptainer run --app gfastats {params.app_path}/PanGeTools.sif \ -g {input.graph} -P \ - -o $(dirname {output.genstats})/{params.panname}.{wildcards.chromosome} \ + -o $(dirname {output.genstats})/{params.pan_name}.{wildcards.chromosome} \ -t {threads} """ @@ -546,17 +549,17 @@ rule graph_figs: pcov="output/chrGraphs.figs/"+config['name']+".{chromosome}.pcov.png" threads: 4 params: - apppath=config['app.path'], + app_path=config['app.path'], oneDviz=config['odgi.1Dviz.params'], pcov=config['odgi.pcov.params'] shell: """ ## 1D viz - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ viz -i {input.graph} -o {output.oneDviz} {params.oneDviz} -t {threads} -P ## Pcov viz - apptainer run --app odgi {params.apppath}/PanGeTools.sif \ + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ viz -i {input.graph} -o {output.pcov} {params.pcov} -t {threads} -P """ @@ -567,14 +570,15 @@ rule aggregate_graphs_stats: output: genstats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv", pathstats="output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" + threads: 1 params: - apppath=config['app.path'], - panname=config['name'] + app_path=config['app.path'], + pan_name=config['name'] shell: """ - apptainer run {params.apppath}/pan1c-env.sif python scripts/chrStatsAggregation.py \ + apptainer run {params.app_path}/pan1c-env.sif python 
scripts/chrStatsAggregation.py \ --input $(dirname {input[0]}) --outputGeneral {output.genstats} \ - --outputPaths {output.pathstats} --panname {params.panname} + --outputPaths {output.pathstats} --panname {params.pan_name} """ rule final_graph_tagging: @@ -584,13 +588,14 @@ rule final_graph_tagging: output: "output/pan1c."+config['name']+".gfa.metadata" threads: 1 + priority: 100 params: - apppath=config['app.path'], - panname=config['name'] + app_path=config['app.path'], + pan_name=config['name'] shell: """ python scripts/getTags.py \ - --appdir {params.apppath} --config-file config.yaml > {output} + --appdir {params.app_path} --config-file config.yaml > {output} sed -i '/^H/r {output}' {input.graph} """ @@ -600,41 +605,38 @@ rule pggb_input_stats: flag="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" output: "output/stats/pan1c."+config['name']+".chrInput.stats.tsv" + threads: 1 params: - apppath=config['app.path'], - panname=config['name'] + app_path=config['app.path'], + pan_name=config['name'] shell: """ - apptainer run {params.apppath}/pan1c-env.sif python scripts/chrInputStats.py \ - -f data/chrInputs/*.fa.gz -o {output} -p {params.panname} + apptainer run {params.app_path}/pan1c-env.sif python scripts/chrInputStats.py \ + -f data/chrInputs/*.fa.gz -o {output} -p {params.pan_name} """ rule core_statistics: # Aggregate chrInput, chrGraph and pggb statistics into a single tsv input: - chrInputStats="output/stats/pan1c."+config['name']+".chrInput.stats.tsv", - chrGraphStats="output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" + chrInputStats = "output/stats/pan1c."+config['name']+".chrInput.stats.tsv", + chrGraphStats = "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" output: - tsv="output/stats/pan1c."+config['name']+".core.stats.tsv", - dir=directory("output/pggb.usage.figs") + tsv = "output/stats/pan1c."+config['name']+".core.stats.tsv", + dir = directory("output/pggb.usage.figs") + threads: 1 params: - apppath=config['app.path'], - panname=config['name'] + app_path=config['app.path'], + pan_name=config['name'] shell: """ mkdir -p {output.dir} - apptainer run {params.apppath}/pan1c-env.sif python scripts/coreStats.py \ + apptainer run {params.app_path}/pan1c-env.sif python scripts/coreStats.py \ --pggbStats logs/pggb --chrInputStats {input.chrInputStats} \ - --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.panname} + --chrGraphStats {input.chrGraphStats} -o {output.tsv} -f {output.dir} -p {params.pan_name} """ """ - -Post-processing section : - The graph for each chromosome are made as well as some basic statistics. - In this section, more stats are produced but more specifics ones requiring dedicated tools (Panacus, PAVs for Panache ...). - It also contains rules to use the graph itself. 
- +Post-processing section """ rule get_pav: # Create PAV matrix readable by panache for a given chromosome scale graph input: "data/chrGraphs/graphsList.txt" output: directory("output/pav.matrices") threads: 16 params: - apppath=config['app.path'] + app_path=config['app.path'] run: shell("mkdir {output}") # Getting the list of graphs with open(input[0]) as handle: graphList = [graph.rstrip("\n") for graph in handle.readlines()] # Iterating over graphs for graph in graphList: - shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.apppath} -t {threads}") + shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.app_path} -t {threads}") rule panacus_stats: # Produces panacus reports for a chromosome graph input: graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa' output: html='output/panacus.reports/'+config['name']+'.{chromosome}.histgrowth.html' + log: + cmd="logs/panacus/{chromosome}.panacus.cmd.log", + time="logs/panacus/{chromosome}.panacus.time.log" params: - apppath=config['app.path'], - panname=config['name'], + app_path=config['app.path'], + pan_name=config['name'], refname=config['reference'] threads: 8 shell: """ - bash scripts/getPanacusHG.sh \ + /usr/bin/time -v -o {log.time} \ + bash scripts/getPanacusHG.sh \ -g {input.graph} \ -r $(basename {params.refname} .fa.gz) \ -d data/chrGraphs/{wildcards.chromosome} \ -o {output.html} \ - -a {params.apppath} \ - -t {threads} + -a {params.app_path} \ + -t {threads} 2>&1 | \ + tee {log.cmd} """ + +rule create_pan1c_report_fig: + # Produces a markdown report figure of chromosome graphs + input: + graph='data/chrGraphs/'+config['name']+'.{chromosome}.gfa', + contigfig="output/chr.contig/{chromosome}.contig.png", + output: + odgifig=temp("tmp/{chromosome}.odgi.png"), + namefig=temp("tmp/{chromosome}.name.png"), + reportfig="output/report/"+config['name']+".{chromosome}.report.fig.png" + threads: 4 + params: + app_path=config['app.path'] + shell: + """ + ## Odgi 1D viz + apptainer run --app odgi {params.app_path}/PanGeTools.sif \ + viz -i {input.graph} -o {output.odgifig} -x 2500 -a 80 -b -H -t {threads} -P + + ## Getting legend from contig figure + convert {input.contigfig} -crop 790x+0+0 +repage {output.namefig} + + odgheight=$(identify -ping -format '%h' {output.odgifig}) + ctgheight=$(identify -ping -format '%h' {input.contigfig}) + + combinedheight=$(($odgheight+$ctgheight)) + + echo -e "[Debug::Report fig]\t$odgheight\t$ctgheight\t$combinedheight" + + ## Creating an empty canvas and adding the other figures to it + convert -size "3300x$combinedheight" xc:white {output.reportfig} + composite -geometry +0+0 {input.contigfig} {output.reportfig} {output.reportfig} + composite -geometry "+0+$ctgheight" {output.namefig} {output.reportfig} {output.reportfig} + composite -geometry "+790+$ctgheight" {output.odgifig} {output.reportfig} {output.reportfig} + """ + +def get_report_sections(wildcards): + """ + Return 'create_pan1c_report' optional inputs to add them to the final report.
+ For example : + - SyRI figures on Assemblies + """ + sections = dict() + + sections["metadata"] = "output/pan1c."+config['name']+".gfa.metadata" + sections["odgifigs"] = expand("output/report/"+config['name']+".{chromosome}.report.fig.png", chromosome=CHRLIST) + sections["genstats"] = "output/stats/pan1c."+config['name']+".chrGraph.general.stats.tsv" + sections["pathstats"] = "output/stats/pan1c."+config['name']+".chrGraph.path.stats.tsv" + + if config["get_ASMs_SyRI"] == "True": + sections["SyRI_on_ASMs_figs"] = expand( + "output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", + haplotype=SAMPLES_NOREF + ) + + return sections + +rule create_pan1c_report: + # Produces a markdown report of chromosomes graphs + input: + unpack(get_report_sections) + output: + report="output/pan1c."+config['name']+".report.md" + threads: 4 + params: + app_path=config['app.path'], + add_SyRI=config['get_ASMs_SyRI'] + run: + shell("touch {output.report}") + + # Adding Summary (made for importing in Joplin) + shell("echo '# Summary' >> {output.report}") + shell("echo '[TOC]' >> {output.report}") + shell("echo '' >> {output.report}") + + # Adding graph construction info + ## WIP + + # Adding metadata + shell("echo '# Graph metadata' >> {output.report}") + shell("echo '```' >> {output.report}") + shell("cat {input.metadata} >> {output.report}") + shell("echo '```' >> {output.report}") + shell("echo '' >> {output.report}") + + # Adding General stats + shell("echo '# General stats' >> {output.report}") + shell("cat {input.genstats} | csv2md -d $'\\t' >> {output.report}") + shell("echo '' >> {output.report}") + + # Adding Path stats + shell("echo '# Path stats' >> {output.report}") + shell("cat {input.pathstats} | csv2md -d $'\\t' >> {output.report}") + shell("echo '' >> {output.report}") + + # Adding chromosomes figures + fig_list = [fig for fig in input.odgifigs] + print(fig_list) + fig_list.sort() + + shell("echo '# Chromosome-scale odgi graphs' >> {output.report}") + for fig in fig_list: + basename=os.path.basename(fig) + chr_name=basename.split('.')[1] + + shell("echo '## {chr_name}' >> {output.report}") + shell("echo '' >> {output.report}") + shell("echo '' >> {output.report}") + + # Adding SyRI figs if produced + if params.add_SyRI: + + fig_list = [fig for fig in input.SyRI_on_ASMs_figs] + fig_list.sort() + + shell("echo '# SyRI on input assemblies' >> {output.report}") + for fig in fig_list: + basename = os.path.basename(fig) + hap_name = basename.split('.')[2:4] + + shell("echo '## {hap_name[0]}, {hap_name[1]}' >> {output.report}") + shell("echo '' >> {output.report}") + + shell("echo '' >> {output.report}") \ No newline at end of file diff --git a/config.yaml b/config.yaml index f60452356b61a29d0621873d279379a2b55634ed..0a8502abe2dabedd0b05e95e023f130705635997 100644 --- a/config.yaml +++ b/config.yaml @@ -8,16 +8,16 @@ app.path: '<path>' # Core parameters # Wfmash alignement parameters : -wfmash.segment_length: 5000 -wfmash.mapping_id: 90 +wfmash.segment_length: 10000 +wfmash.mapping_id: 95 wfmash.secondary: '-k 19 -H 0.001 -X' # Seqwish parameters seqwish.params: '-B 10000000 -k 19 -f 0' # Odgi 1D and path coverage viz parameters -odgi.1Dviz.params: '-x 2000 -a 25 -b' -odgi.pcov.params: '-x 2000 -a 25 -O' +odgi.1Dviz.params: '-x 2000 -b' +odgi.pcov.params: '-x 2000 -O' ## Optional parts of the workflow # Running Quast to get statistics on input haplotypes @@ -27,5 +27,7 @@ get_contig_pos: 'True' # Computes Presence Absence Variant matrices for Panache (not recommended; very long) get_PAV: 
'False' # Computes SyRI figures for haplotypes -#get_allASM_SyRI: 'False' # All vs all get_ASMs_SyRI: 'False' # Haplotype vs Reference +get_chrInputs_SyRI: 'False' # SyRI on chrInputs +# Creating final report +create_report: 'True' diff --git a/data/.gitkeep b/data/.gitkeep new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml index 4a415bf1e89dd488fbcad3ab36e8a447e6dcd46e..cc0c279fee81b7a6f5c2f430f2af704756ed58bc 100644 --- a/example/config_CICD.yaml +++ b/example/config_CICD.yaml @@ -28,4 +28,7 @@ get_contig_pos: 'True' get_PAV: 'False' # Computes SyRI figures for haplotypes #get_allASM_SyRI: 'False' # All vs all -get_ASMs_SyRI: 'False' # Haplotype vs Reference +get_ASMs_SyRI: 'True' # Haplotype vs Reference +get_chrInputs_SyRI: 'True' # SyRI on chrInputs +# Creating final report +create_report: 'True' diff --git a/rules/tools.smk b/rules/tools.smk new file mode 100644 index 0000000000000000000000000000000000000000..53fe1ff12207014832f44559b92bf28efd907294 --- /dev/null +++ b/rules/tools.smk @@ -0,0 +1,31 @@ +""" +Tool rules --------------------------------------------------------------------------------------- +""" +rule samtools_index: + # Samtools faidx to index compressed fasta + input: + fa="{sample}.fa.gz" + output: + "{sample}.fa.gz.fai", + "{sample}.fa.gz.gzi" + threads: 1 + params: + app_path=config["app.path"] + shell: + "apptainer run --app samtools {params.app_path}/PanGeTools.sif " + "faidx {input.fa}" + +rule run_bgzip: + # Run BGZIP on the file + input: + "{file}" + output: + "{file}.gz" + threads: 4 + params: + app_path=config["app.path"] + shell: + """ + apptainer run --app bgzip {params.app_path}/PanGeTools.sif \ + -@ {threads} {input} + """ \ No newline at end of file diff --git a/scripts/chrInputStats.py b/scripts/chrInputStats.py index 8b3015b3cf51838b537562ebb387cc71cac19fe7..85d3d26d89a3320d5170b725aad61bbea7e77641 100644 --- a/scripts/chrInputStats.py +++ b/scripts/chrInputStats.py @@ -55,6 +55,9 @@ Sequence dictionnary : """ seqDict = {} +# Sorting +args.fastafiles.sort() + # Parsing fasta files for filename in args.fastafiles: diff --git a/scripts/chrStatsAggregation.py b/scripts/chrStatsAggregation.py index 3ac2b1016daff05e8f269cf3db18c3991a17d8fa..404549e904e0f11ab1309512b68b72fa497b3e5f 100644 --- a/scripts/chrStatsAggregation.py +++ b/scripts/chrStatsAggregation.py @@ -43,6 +43,8 @@ args = arg_parser.parse_args() ## Main script # Getting the list of stats files fileList = [k for k in os.listdir(args.inputDir) if k[-3:] == "tsv"] +# Sorting +fileList.sort() stats = { "general": [], diff --git a/scripts/contig_position.R b/scripts/contig_position.R index 7ccd486af6227e20d36950d4ae344f150748a636..b0eeb0a10bd783f1160aaac4bbfa5a739a79abf7 100644 --- a/scripts/contig_position.R +++ b/scripts/contig_position.R @@ -23,6 +23,7 @@ x = read.table(opt$tsv, sep = "\t", comment.char="^") colnames(x) = x[1,] x = x[-1,] my.genome <- toGRanges(x) +nChr = nrow(x) #print("Reading bands file ...") x = read.table(opt$bands, sep="\t", comment.char="^") @@ -32,15 +33,45 @@ my.cytobands <- toGRanges(x) # Creating figure # Customizing figure parameters : see https://bernatgel.github.io/karyoploter_tutorial//Tutorial/PlotParams/PlotParams.html + pp <- getDefaultPlotParams(plot.type=1) -pp$ideogramheight <- 200 -pp$data1outmargin <- 50 -pp$leftmargin <- 0.28 -pp$rightmargin <- 0 -pp$data1height <- 0 -pp$data2height <- 0 - -png(opt$out, width=2780, height=500, 
pointsize=6, res=300, antialias = "none") + +if (nChr >= 4){ + + pp$ideogramheight <- 50 + pp$data1outmargin <- 15 + pp$data2outmargin <- 15 + pp$data1inmargin <- 0 + pp$data2inmargin <- 0 + pp$leftmargin <- 0.24 + pp$rightmargin <- 0 + pp$topmargin <- 0 + pp$bottommargin <- 0 + pp$data1height <- 0 + pp$data2height <- 0 + + png(opt$out, width=3300, height=(nChr*80), pointsize=6, res=300, antialias = "none") + +} else { + # Too low chromosomes cause crash in par handler. + pp$ideogramheight <- 50 + pp$data1outmargin <- 15 + pp$data2outmargin <- 15 + pp$data1inmargin <- 0 + pp$data2inmargin <- 0 + pp$leftmargin <- 0.24 + pp$rightmargin <- 0 + pp$topmargin <- 0.5 + pp$bottommargin <- 0 + pp$data1height <- 0 + pp$data2height <- 0 + + png(opt$out, width=3300, height=(nChr*80*2), pointsize=6, res=300, antialias = "none") + +} + + + kp <- plotKaryotype(genome=my.genome, cytobands=my.cytobands, plot.params=pp, chromosomes="all") kp <- kpAddBaseNumbers(kp, cex=0.6, tick.dist=5000000) dev.off() \ No newline at end of file diff --git a/scripts/coreStats.py b/scripts/coreStats.py index 91e693fc7aa1bc9bcb03f7c889ead49a48958363..db0991c3640829cbc0c71ed4e116a6c128d610d7 100644 --- a/scripts/coreStats.py +++ b/scripts/coreStats.py @@ -72,6 +72,10 @@ arg_parser.add_argument( ) args = arg_parser.parse_args() +# Sorting +args.chrGraphStatsFiles.sort() +args.chrInputStatsFiles.sort() + ## Toolbox def getRegEquation(x, y): (slope, intercept), ssr, _1, _2, _3 = np.polyfit(x, y, 1, full = True) @@ -89,6 +93,7 @@ PGGBsteps = [] # Storing PGGB steps name # Iterating through all pggb time stats TimeStatsList = [os.path.join(args.pggbStats, file) for file in os.listdir(args.pggbStats) if file.split('.')[-2] == "time"] +TimeStatsList.sort() print(TimeStatsList) for pggbStats in TimeStatsList: diff --git a/scripts/getPanacusHG.sh b/scripts/getPanacusHG.sh index 7f3ddb994947ad138d33d268496a1389a157d907..d26dda39bc02a53159ec9b6a3c79203e3838b1e9 100755 --- a/scripts/getPanacusHG.sh +++ b/scripts/getPanacusHG.sh @@ -35,4 +35,4 @@ grep '^P' $gfa | cut -f2 | grep -ve "$ref" > ${chrdir}/$chrname.paths.noref.txt # Running Panacus echo "[getPanacusHG::panacus] Running on ${gfa}" apptainer run --app panacus $appdir/PanGeTools.sif histgrowth \ - -t $threads -l 1,2,1,1,1 -q 0,0,1,0.5,0.1 -S -a -s ${chrdir}/$chrname.paths.noref.txt -c all -o html $gfa > ${output} + -t $threads -l 1,2,1,1,1 -q 0,0,1,0.5,0.1 -a -s ${chrdir}/$chrname.paths.noref.txt -c all -o html $gfa > ${output} diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh index 3f3c7ffa76fec871d1f932552590313c3f87000f..84f93c2bf82f8bff51581690f4c6ef7b85351ffa 100755 --- a/scripts/getSyriFigs.sh +++ b/scripts/getSyriFigs.sh @@ -9,9 +9,11 @@ appdir="" # Directory containing apptainer images threads="" wrkdir="" # Working directory (directory used by pggb to store step files like .paf, etc...) output="" # Output Syri figure(s) +height=16 # Figure height +width=9 # Figure width ## Getting arguments -while getopts "r:q:a:t:d:o:" option; do +while getopts "r:q:a:t:d:o:h:w:" option; do case "$option" in r) ref="$OPTARG";; q) qry="$OPTARG";; @@ -19,7 +21,9 @@ while getopts "r:q:a:t:d:o:" option; do t) threads="$OPTARG";; d) wrkdir="$OPTARG";; o) output="$OPTARG";; - \?) echo "Usage: $0 [-r ref] [-q query] [-a appdir] [-t threads] [-d wrkdir] [-o output]" >&2 + h) height="$OPTARG";; + w) width="$OPTARG";; + \?) 
echo "Usage: $0 [-r ref] [-q query] [-a appdir] [-t threads] [-d wrkdir] [-o output] [-h height] [-w width]" >&2 exit 1;; esac done @@ -27,15 +31,22 @@ done ## Main script # Reading query argument and creating an array containing the path to query fasta files IFS=' ' read -r -a temp <<< "$qry" +readarray -td '' temp_sorted < <(printf '%s\0' "${temp[@]}" | sort -z) + +#echo "Query : $qry" +#echo "tempArr : ${temp[@]}" asmList=("$ref") + # Sorting the array to put the reference in first -for item in "${temp[@]}"; do +for item in "${temp_sorted[@]}"; do if [[ $item != "$ref" ]]; then asmList+=($item) fi done +#echo "The array : ${asmList[@]}" + # Array to store the created syri files syriFileList=() @@ -45,13 +56,26 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do # Setting filepaths for later bamFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).bam" syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out" + hapID=$(basename ${asmList[i + 1]}) + refID=$(basename ${asmList[i]}) + + # Debug + echo -e "\n[Debug::SyRI_script::$hapID] hapID: $hapID\trefID: $refID\n" # Adding the output syri file to the array syriFileList+=($syriFile) # Prunning fasta files based of common chromosomes apptainer run $appdir/pan1c-env.sif python scripts/syri_pruning.py \ - -r ${asmList[i]} -f ${asmList[i + 1]} -o $wrkdir + -r ${asmList[i]} -f ${asmList[i + 1]} -o $wrkdir -d + + #echo "\n[Debug::SyRI_script::$hapID] REF" + #grep '>' $wrkdir/$(basename ${asmList[i]} .gz) + #echo "\n[Debug::SyRI_script::$hapID] QRY" + #grep '>' $wrkdir/$(basename ${asmList[i + 1]} .gz) + + # Renaming chromosomes with same ids as the reference (Not working) + #sed -i "s/${hapID}#chr/${refID}#chr/g" $wrkdir/$(basename ${asmList[i + 1]} .gz) #Â Minimap2 genome vs genome alignment apptainer run --app minimap2 $appdir/PanGeTools.sif \ @@ -60,11 +84,12 @@ for ((i = 0; i < ${#asmList[@]} - 1; i++)); do -t $threads > $wrkdir/$bamFile.sam apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads $wrkdir/$bamFile.sam > $wrkdir/$bamFile - rm $wrkdir/$bamFile.sam $wrkdir/*.fa + rm $wrkdir/$bamFile.sam + #rm $wrkdir/*.fa # Syri on previous alignment apptainer run $appdir/pan1c-env.sif \ - syri -c $wrkdir/$bamFile -r ${asmList[i]} -q ${asmList[i + 1]} -k -F B \ + syri -c $wrkdir/$bamFile -r $wrkdir/$(basename ${asmList[i]} .gz) -q $wrkdir/$(basename ${asmList[i + 1]} .gz) -k -F B \ --nc $threads \ --dir $wrkdir --prefix "$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz)." done @@ -73,11 +98,11 @@ done # Each line contains 2 columns : fasta filepath and its simpler name echo -e "#files\tname" > $wrkdir/genomes.txt for asm in "${asmList[@]}"; do - echo -e "${asm}\t$(basename $asm .fa.gz | cut -d'.' -f1)" >> $wrkdir/genomes.txt + echo -e "$wrkdir/$(basename ${asm} .gz)\t$(basename $asm .fa.gz | cut -d'.' 
-f1)" >> $wrkdir/genomes.txt done # Generating the plotsr command -command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f 12 -H 16 -W 9 -d 600 " +command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f 12 -H $height -W $width -d 600 " # Adding syri files to the command as each needs to be specified using "--sr" argument for file in "${syriFileList[@]}"; do diff --git a/scripts/ragtagChromInfer.sh b/scripts/ragtagChromInfer.sh index 9afa5384bf8715eac61d33751227618110c48b0e..b8dc8eab24e535d82c956c025b540d620c8c2c29 100755 --- a/scripts/ragtagChromInfer.sh +++ b/scripts/ragtagChromInfer.sh @@ -41,11 +41,11 @@ grep ">${ref}#chr*" -A1 $tmpdir/ragtag.scaffold.fasta | \ sed "s/${ref}#chr\([^_]*\)_RagTag/${hapID}#chr\1/g" > $tmpdir/${sample}.ragtagged.fa # Compressing fasta -apptainer run --app bgzip $appdir/PanGeTools.sif \ - -@ $threads $tmpdir/${sample}.ragtagged.fa +# apptainer run --app bgzip $appdir/PanGeTools.sif \ +# -@ $threads $tmpdir/${sample}.ragtagged.fa # Moving fa.gz to output dir -mv $tmpdir/${sample}.ragtagged.fa.gz $output +mv $tmpdir/${sample}.ragtagged.fa $output # Compressing temporary files tar --remove-files -cf $tmpdir.tar $tmpdir diff --git a/scripts/syri_pruning.py b/scripts/syri_pruning.py index 811e2b97dc1ee9d0a3b18815489ff4149b9b32d1..2482cb24e6f87e123622e10cf1a8a5694f99c528 100644 --- a/scripts/syri_pruning.py +++ b/scripts/syri_pruning.py @@ -62,6 +62,7 @@ with gzip.open(args.ref, 'rt') as handle: chrname = record.id.split("#")[-1] chrList.append(chrname) seqDict[name][chrname] = record + seqDict[name][chrname].id = chrname # Parsing fasta files for filename in args.fastafiles: @@ -74,6 +75,7 @@ for filename in args.fastafiles: for record in sequences: chrname = record.id.split("#")[-1] seqDict[name][chrname] = record + seqDict[name][chrname].id = chrname # Getting the list of common chromosomes for hap, chromosomes in seqDict.items():