From 7d3a9fa86e77b8e09cad0c875d02247c8e3a4da7 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 4 Mar 2024 10:05:46 +0100
Subject: [PATCH 01/24] Removed old script

---
 scripts/analyzeWorflowData.py | 113 ----------------------------------
 1 file changed, 113 deletions(-)
 delete mode 100644 scripts/analyzeWorflowData.py

diff --git a/scripts/analyzeWorflowData.py b/scripts/analyzeWorflowData.py
deleted file mode 100644
index de0e259..0000000
--- a/scripts/analyzeWorflowData.py
+++ /dev/null
@@ -1,113 +0,0 @@
-"""
-Clustering script for Pan1c workflow
-
-Given a list of fasta files with records ids following the pattern <haplotype>#<chromosome id>,
-the script clusters sequence by chromosome and returns several fasta. 
-Each output fasta contains sequences related to one chromosome only.
-
-@author: alexis.mergez@inrae.fr
-@version: 1.0 
-"""
-
-from Bio import SeqIO
-import numpy as np
-import pandas as pd
-import argparse
-import gzip
-import os
-import seaborn as sns
-import matplotlib.pyplot as plt
-
-## Arguments
-arg_parser = argparse.ArgumentParser(description='Pan1c input haplotype clustering')
-arg_parser.add_argument(
-    "--statsfiles",
-    "-i",
-    nargs="+",
-    dest = "statsfiles",
-    required = True,
-    help = "Workflow statistics file(s)"
-    )
-arg_parser.add_argument(
-    "--output",
-    "-o",
-    dest = "outdir",
-    required = True,
-    help = "Output directory"
-    )
-arg_parser.add_argument(
-    "--debug",
-    "-d",
-    action="store_true",
-    dest = "debug",
-    help = "Show debug"
-    )
-args = arg_parser.parse_args()
-
-## Toolbox
-def getRegEquation(x, y):
-    (slope, intercept), ssr, _1, _2, _3 = np.polyfit(x, y, 1, full = True)
-    ymean = np.mean(y)
-    sst = np.sum((y - ymean)**2)
-    r2 = (1 - (ssr/sst))[0]
-    return f'y = {slope:.2f}x + {intercept:.2f} (R2 : {r2:.2f})'
-
-## Main script
-dfList = []
-for file in args.statsfiles:
-    dfList.append(
-        pd.read_csv(
-            file,
-            sep='\t',
-        )
-    )
-
-df = pd.concat(dfList, ignore_index = True)
-
-if args.debug: print(df)
-df.to_csv("Final.tsv", sep='\t', index=False)
-
-# Memory versus mean base count
-sns.regplot(x=df["input.mean.length"], y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
-equation = getRegEquation(x=df["input.mean.length"], y=df.mem)
-plt.xlabel('Mean input sequences length (#bases)')
-plt.ylabel('Peak memory usage (GB)')
-plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
-plt.savefig("MemoryVSMeanSeqLength.png")
-plt.close()
-
-# Memory versus base count
-sns.regplot(x=df["input.total.length"], y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
-equation = getRegEquation(x=df["input.total.length"], y=df.mem)
-plt.xlabel('Total input sequences length (#bases)')
-plt.ylabel('Peak memory usage (GB)')
-plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
-plt.savefig("MemoryVSSeqLength.png")
-plt.close()
-
-# PCA
-if False:
-    excluded_columns = ["pangenome.name", "chrom.id", "time"]
-    columns = [
-        col 
-        for col in df.columns.tolist()
-        if col not in excluded_columns
-    ]
-    crdf = df.copy()
-    crdf[columns] = (df[columns]-df[columns].mean())/df[columns].std()
-
-    if args.debug: print(crdf)
-
-    from sklearn.decomposition import PCA
-    pca = PCA(n_components=3)
-    newX = pca.fit_transform(crdf[columns])
-
-    pcaDF = pd.DataFrame(newX)
-    pcaDF.columns = ["D1", "D2", "D3"]
-    print(pcaDF)
-
-    finalDF = pd.concat([df[excluded_columns], pcaDF], axis = 1)
-    print(finalDF)
-
-    fig = sns.lmplot(x="D1", y="D2", hue="chrom.id", data = finalDF, fit_reg=False)
-    plt.savefig("Test.png")
\ No newline at end of file
-- 
GitLab


From 659f502a8eb97e36fe585c91f6fc70b881c703d6 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 4 Mar 2024 10:06:25 +0100
Subject: [PATCH 02/24] Update getPanacusHG.sh

Changed path listing from `odgi paths` to a simple grep on the GFA P lines
---
 scripts/getPanacusHG.sh | 10 ++++------
 1 file changed, 4 insertions(+), 6 deletions(-)

diff --git a/scripts/getPanacusHG.sh b/scripts/getPanacusHG.sh
index 25ba26b..41a2222 100755
--- a/scripts/getPanacusHG.sh
+++ b/scripts/getPanacusHG.sh
@@ -1,5 +1,6 @@
 #!/bin/bash
-# Get a GFA and returns a Panacus report
+# Create a panacus report in html from a given gfa
+# @author: alexis.mergez@inrae.fr
 
 # Initializing arguments
 gfa=""          # GFA path
@@ -27,14 +28,11 @@ done
 chrname=$(basename ${gfa} .gfa)
 ref=$(echo $refname | sed 's/.hap/#/')
 
-
 # Getting paths in chromosome graph
-echo "[getPanacusHG::odgi::paths] Running on ${gfa}"
-apptainer run --app odgi "${appdir}/PanGeTools.sif" paths -i ${gfa} -L | grep -ve "$ref" > ${chrdir}/$chrname.paths.noref.txt
+echo "[getPanacusHG::paths] Running on ${gfa}"
+grep '^P' $gfa | cut -f2 | grep -ve "$ref" > ${chrdir}/$chrname.paths.noref.txt
 
 # Running Panacus
 echo "[getPanacusHG::panacus] Running on ${gfa}"
 apptainer run --app panacus $appdir/PanGeTools.sif histgrowth \
     -t $threads -l 1,2,1,1,1 -q 0,0,1,0.5,0.1 -S -a -s ${chrdir}/$chrname.paths.noref.txt -c all -o html $gfa > ${output}
-
-
-- 
GitLab


From 4e95efadff4e55202366b916e88b5f0c64fff620 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 4 Mar 2024 10:07:50 +0100
Subject: [PATCH 03/24] Update workflowStats.py

Changed misleading column names in statistics files:
- time -> pggb.time
- cpu -> pggb.cpu
- mem -> pggb.mem
---
 scripts/workflowStats.py | 23 +++++++++--------------
 1 file changed, 9 insertions(+), 14 deletions(-)

diff --git a/scripts/workflowStats.py b/scripts/workflowStats.py
index fa9e4f3..a32048f 100644
--- a/scripts/workflowStats.py
+++ b/scripts/workflowStats.py
@@ -114,7 +114,7 @@ for tarball in args.tarballs:
             if args.debug: print(f"[timeStats::debug] Pangenome Name: {panname}\tChr: {chrid}\tTime: {time}\tCPU: {cpu}%\t memory: {memory}Gb")
 
             # Adding to aggregatedData
-            aggregatedData[(panname,chrid)] = {"time": time, "cpu": cpu, "mem": memory}
+            aggregatedData[(panname,chrid)] = {"pggb.time": time, "pggb.cpu": cpu, "pggb.mem": memory}
 
 ## Parsing chromosome input file size (in #bases).
 # Iterating over input length files
@@ -181,8 +181,8 @@ df.to_csv(args.output, sep='\t', index=False)
 
 # Creating some figures
 # Time versus base count
-sns.regplot(x=df["input.total.length"], y=df.time.dt.total_seconds(), line_kws={"color":"r","alpha":0.7,"lw":5})
-equation = getRegEquation(x=df["input.total.length"], y=df.time.dt.total_seconds())
+sns.regplot(x=df["input.total.length"], y=df["pggb.time"].dt.total_seconds(), line_kws={"color":"r","alpha":0.7,"lw":5})
+equation = getRegEquation(x=df["input.total.length"], y=df["pggb.time"].dt.total_seconds())
 plt.xlabel('Total input sequences length (#bases)')
 plt.ylabel('Graph creation time (s)')
 plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
@@ -190,8 +190,8 @@ plt.savefig(os.path.join(args.figdir,"TimeVSSeqLength.png"))
 plt.close()
 
 # Memory versus base count
-sns.regplot(x=df["input.total.length"], y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
-equation = getRegEquation(x=df["input.total.length"], y=df.mem)
+sns.regplot(x=df["input.total.length"], y=["pggb.mem"], line_kws={"color":"r","alpha":0.7,"lw":5})
+equation = getRegEquation(x=df["input.total.length"], y=df["pggb.mem"])
 plt.xlabel('Total input sequences length (#bases)')
 plt.ylabel('Peak memory usage (GB)')
 plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
@@ -199,8 +199,8 @@ plt.savefig(os.path.join(args.figdir,"MemoryVSSeqLength.png"))
 plt.close()
 
 # CPU versus base count
-sns.regplot(x=df["input.total.length"], y=df.cpu, line_kws={"color":"r","alpha":0.7,"lw":5})
-equation = getRegEquation(x=df["input.total.length"], y=df.cpu)
+sns.regplot(x=df["input.total.length"], y=df["pggb.cpu"], line_kws={"color":"r","alpha":0.7,"lw":5})
+equation = getRegEquation(x=df["input.total.length"], y=df["pggb.cpu"])
 plt.xlabel('Total input sequences length (#bases)')
 plt.ylabel('CPU usage (%)')
 plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
@@ -208,15 +208,10 @@ plt.savefig(os.path.join(args.figdir,"CpuVSSeqLength.png"))
 plt.close()
 
 # Memory versus CPU
-sns.regplot(x=df.cpu, y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
-equation = getRegEquation(x=df.cpu, y=df.mem)
+sns.regplot(x=df["pggb.cpu"], y=df["pggb.mem"], line_kws={"color":"r","alpha":0.7,"lw":5})
+equation = getRegEquation(x=df["pggb.cpu"], y=df["pggb.mem"])
 plt.xlabel('CPU usage (%)')
 plt.ylabel('Peak memory usage (GB)')
 plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
 plt.savefig(os.path.join(args.figdir,"MemoryVSCpu.png"))
 plt.close()
-
-
-
-
-
-- 
GitLab


From 8674d5417512005cdc14ed0a50014c8a6a5105d3 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 4 Mar 2024 17:13:36 +0100
Subject: [PATCH 04/24] Added metadata to final graph

---
 Snakefile          |  19 ++++++-
 scripts/getTags.py | 124 +++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 142 insertions(+), 1 deletion(-)
 create mode 100644 scripts/getTags.py

diff --git a/Snakefile b/Snakefile
index 260d97d..4bdc50b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -22,7 +22,8 @@ def which_post_analysis():
     post_analysis_inputs = [     # Default post analysis steps
         "output/pan1c.pggb."+config['name']+".workflow.stats.tsv",
         "output/figures",
-        "output/panacus.reports"
+        "output/panacus.reports",
+        "output/pan1c.pggb."+config['name']+".gfa.metadata"
         ]
     
     # Optionals post analysis steps
@@ -261,6 +262,22 @@ rule workflow_statistics:
             -i {input.tar} -c {input.chrInputStats} -g {input.chrGraphStats} -o {output.tsv} -f {output.dir}
         """
 
+rule add_metadata:
+    # Add metadata to the final GFA
+    input:
+        graph="output/pan1c.pggb."+config['name']+".gfa",
+    output:
+        "output/pan1c.pggb."+config['name']+".gfa.metadata"
+    params:
+        apppath=config['app.path'],
+        panname=config['name']
+    shell:
+        """
+        apptainer run {params.apppath}/pan1c-env.sif python scripts/getTags.py \
+            --appdir {params.apppath} --config-file config.yaml > {output}
+        sed -i '/^H*/r {output}' {input.graph}
+        """
+
 rule panacus_stats:
     input:
         "data/chrGraphs/graphsList.txt"
diff --git a/scripts/getTags.py b/scripts/getTags.py
new file mode 100644
index 0000000..013f504
--- /dev/null
+++ b/scripts/getTags.py
@@ -0,0 +1,124 @@
+"""
+Tag list creation script for Pan1c workflow
+
+Returns a list of tags to add at the top of the final GFA file as commented lines.
+
+@author: alexis.mergez@inrae.fr
+@version: 1.0 
+"""
+
+import argparse
+import subprocess
+import os
+import json
+
+## Arguments
+arg_parser = argparse.ArgumentParser(description='Tag list creation script')
+arg_parser.add_argument(
+    "--appdir",
+    "-a",
+    dest = "appdir",
+    required = True,
+    help = "Apptainer images directory"
+    )
+arg_parser.add_argument(
+    "--config-file",
+    "-c",
+    dest = "config",
+    required = True,
+    help = "Pan1c config file"
+    )
+args = arg_parser.parse_args()
+
+tags = {}
+
+## Pan1c-workflow section
+tags["Pan1c"] = {}
+_output = subprocess.run(
+    ["git", "describe", "--tags"],
+    capture_output=True,
+    text=True,
+).stdout[:-1]
+with open(args.config, 'r') as handle:
+    pggbCmd = [line[:-1] for line in handle.readlines() if "pggb.params" in line][0].split(': ')[-1]
+
+tags["Pan1c"]["pan1c.version"] = _output
+tags["Pan1c"]["pan1c.home"] = "https://forgemia.inra.fr/alexis.mergez/pan1c"
+tags["Pan1c"]["pan1c.pggb.args"] = pggbCmd
+
+## PanGeTools section
+tags["pangetools"] = {}
+_output = subprocess.run(
+    ["apptainer", "inspect", "-j", f"{args.appdir}/PanGeTools.sif"],
+    capture_output=True, 
+    text=True
+).stdout
+_output = json.loads(_output)
+labels = _output['data']['attributes']['labels']
+tags["pangetools"]["image.version"] = labels['Version']
+tags["pangetools"]["image.home"] = labels['about.home']
+
+for key in labels.keys():
+    if ".Version" in key:
+        tags["pangetools"][key.lower()] = labels[key]
+
+## PGGB image section
+tags["pggb"] = {}
+
+_output = subprocess.run(
+    ["apptainer", "inspect", "-j", f"{args.appdir}/pggb.sif"],
+    capture_output=True, 
+    text=True
+).stdout
+_output = json.loads(_output)
+labels = _output['data']['attributes']['labels']
+tags["pggb"]["image.version"] = labels['Version']
+tags["pggb"]["image.home"] = labels['about.home']
+
+for key in labels.keys():
+    if ".Version" in key:
+        tags["pggb"][key.lower()] = labels[key]
+
+## Pan1c-Env section
+tags["pan1c-env"] = {}
+
+_output = subprocess.run(
+    ["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-env.sif"],
+    capture_output=True, 
+    text=True
+).stdout
+_output = json.loads(_output)
+labels = _output['data']['attributes']['labels']
+tags["pan1c-env"]["image.version"] = labels['Version']
+tags["pan1c-env"]["image.home"] = labels['about.home']
+
+for key in labels.keys():
+    if ".Version" in key:
+        tags["pan1c-env"][key.lower()] = labels[key]
+
+## Pan1c-Box section
+tags["pan1c-box"] = {}
+
+_output = subprocess.run(
+    ["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-box.sif"],
+    capture_output=True, 
+    text=True
+).stdout
+_output = json.loads(_output)
+labels = _output['data']['attributes']['labels']
+tags["pan1c-box"]["image.version"] = labels['Version']
+tags["pan1c-box"]["image.home"] = labels['about.home']
+
+for key in labels.keys():
+    if ".Version" in key:
+        tags["pan1c-box"][key.lower()] = labels[key]
+
+## Exporting tags
+print("#\tThis graph have been created using the Pan1c workflow (https://forgemia.inra.fr/alexis.mergez/pan1c)\n#")
+print("#\tTool versions and commands\n#")
+for first_elem in tags.keys():
+    print(f'#\t-- {first_elem} --')
+    for label in tags[first_elem].keys():
+        print(f"#\t{first_elem}\t{label}: {tags[first_elem][label]}")
+    print('#')
+    
-- 
GitLab


From 18ee4e239a5b192537bda97e37a5be638b2520d7 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 4 Mar 2024 17:56:25 +0100
Subject: [PATCH 05/24] Renaming and clarifying steps

---
 Snakefile                                  | 77 +++++++++++--------
 scripts/{workflowStats.py => coreStats.py} |  2 +-
 scripts/inputStats.py                      | 86 ++++++++++++++++++++++
 3 files changed, 133 insertions(+), 32 deletions(-)
 rename scripts/{workflowStats.py => coreStats.py} (99%)
 create mode 100644 scripts/inputStats.py

diff --git a/Snakefile b/Snakefile
index 4bdc50b..3d7af83 100644
--- a/Snakefile
+++ b/Snakefile
@@ -20,10 +20,9 @@ with gzip.open("data/haplotypes/"+config['reference'], "r") as handle:
 def which_post_analysis():
     ## Simple function to configure which parts of the workflow needs to be run
     post_analysis_inputs = [     # Default post analysis steps
-        "output/pan1c.pggb."+config['name']+".workflow.stats.tsv",
+        "output/pan1c.pggb."+config['name']+".core.stats.tsv",
         "output/figures",
         "output/panacus.reports",
-        "output/pan1c.pggb."+config['name']+".gfa.metadata"
         ]
     
     # Optionals post analysis steps
@@ -35,6 +34,7 @@ def which_post_analysis():
 rule all:
     input:
         "output/pan1c.pggb."+config['name']+".gfa",
+        "output/pan1c.pggb."+config['name']+".gfa.metadata",
         "output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv",
         which_post_analysis()
 
@@ -75,6 +75,10 @@ rule ragtag_scaffolding:
             -o {output}
         """
 
+"""
+Core section
+"""
+
 rule clustering:
     # Read alignment file to create bins for each chromosome
     input:
@@ -201,25 +205,24 @@ rule aggregate_stats:
             --input {input} --output {output} --panname {params.panname}
         """
 
-rule get_pav:
-    # Create PAV matrix readable by panache for a given chromosome scale graph
+rule gfaTagR:
+    # Add metadata to the final GFA
     input:
-        "data/chrGraphs/graphsList.txt"
+        graph="output/pan1c.pggb."+config['name']+".gfa",
     output:
-        directory("output/pav.matrices")
-    threads: 16
+        "output/pan1c.pggb."+config['name']+".gfa.metadata"
     params:
-        apppath=config['app.path']
-    run:
-        shell("mkdir {output}")
-        # Getting the list of graphs
-        with open(input[0]) as handle:
-            graphList = [graph.rstrip("\n") for graph in handle.readlines()]
-        # Iterating over graphs
-        for graph in graphList:
-            shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.apppath} -t {threads}")
+        apppath=config['app.path'],
+        panname=config['name']
+    shell:
+        """
+        apptainer run {params.apppath}/pan1c-env.sif python scripts/getTags.py \
+            --appdir {params.apppath} --config-file config.yaml > {output}
+        sed -i '/^H*/r {output}' {input.graph}
+        """
 
 rule pggb_log_compression:
+    # Compresses the logs of pggb into a tarball. (1 file to load in following steps) 
     input:
         flag="output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv"
     output:
@@ -232,6 +235,7 @@ rule pggb_log_compression:
         """
 
 rule pggb_input_stats:
+    # Produces statistics on pggb input sequences
     input:
         flag="output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv"
     output:
@@ -245,40 +249,51 @@ rule pggb_input_stats:
             -f data/chrInputs/*.fa.gz -o {output} -p {params.panname}
         """
 
-rule workflow_statistics:
+rule core_statistics:
+    # Combines all statistics from the input of pggb to their respective graphs 
     input:
         tar="logs/pan1c.pggb."+config['name']+".logs.tar.gz",
         chrInputStats="data/chrInputs/pan1c.pggb."+config['name']+".chrInput.stats.tsv",
         chrGraphStats="output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv"
     output:
-        tsv="output/pan1c.pggb."+config['name']+".workflow.stats.tsv",
+        tsv="output/pan1c.pggb."+config['name']+".core.stats.tsv",
         dir=directory("output/pggb.usage.figs")
     params:
         apppath=config['app.path']
     shell:
         """
         mkdir -p {output.dir}
-        apptainer run {params.apppath}/pan1c-env.sif python scripts/workflowStats.py \
+        apptainer run {params.apppath}/pan1c-env.sif python scripts/coreStats.py \
             -i {input.tar} -c {input.chrInputStats} -g {input.chrGraphStats} -o {output.tsv} -f {output.dir}
         """
 
-rule add_metadata:
-    # Add metadata to the final GFA
+"""
+Post-processing section :
+    At this point, the graph of each chromosome has been built along with some basic statistics.
+    This section produces additional, more specific statistics that require dedicated tools (Panacus, PAVs for Panache ...).
+    It also contains rules that use the graph itself.
+"""
+
+rule get_pav:
+    # Create PAV matrix readable by panache for a given chromosome scale graph
     input:
-        graph="output/pan1c.pggb."+config['name']+".gfa",
+        "data/chrGraphs/graphsList.txt"
     output:
-        "output/pan1c.pggb."+config['name']+".gfa.metadata"
+        directory("output/pav.matrices")
+    threads: 16
     params:
-        apppath=config['app.path'],
-        panname=config['name']
-    shell:
-        """
-        apptainer run {params.apppath}/pan1c-env.sif python scripts/getTags.py \
-            --appdir {params.apppath} --config-file config.yaml > {output}
-        sed -i '/^H*/r {output}' {input.graph}
-        """
+        apppath=config['app.path']
+    run:
+        shell("mkdir {output}")
+        # Getting the list of graphs
+        with open(input[0]) as handle:
+            graphList = [graph.rstrip("\n") for graph in handle.readlines()]
+        # Iterating over graphs
+        for graph in graphList:
+            shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.apppath} -t {threads}")
 
 rule panacus_stats:
+    # Produces panacus reports for each chromosome graphs
     input:
         "data/chrGraphs/graphsList.txt"
     output:
diff --git a/scripts/workflowStats.py b/scripts/coreStats.py
similarity index 99%
rename from scripts/workflowStats.py
rename to scripts/coreStats.py
index a32048f..6e28c60 100644
--- a/scripts/workflowStats.py
+++ b/scripts/coreStats.py
@@ -1,5 +1,5 @@
 """
-Workflow statistics script for Pan1c workflow.
+Core statistics script for Pan1c workflow.
 
 Given one or more tarball containing time logs of the pggb rule of the workflow, 
 the script returns an aggregated table as well as figures    
diff --git a/scripts/inputStats.py b/scripts/inputStats.py
new file mode 100644
index 0000000..e102b11
--- /dev/null
+++ b/scripts/inputStats.py
@@ -0,0 +1,86 @@
+"""
+Input statistics script for Pan1c workflow
+
+This script produces statistics of the input haplotypes, such as k-mer counts and lengths.
+
+@author: alexis.mergez@inrae.fr
+@version: 1.0 
+"""
+
+from Bio import SeqIO
+import numpy as np
+import argparse
+import gzip
+import os
+
+## Arguments
+arg_parser = argparse.ArgumentParser(description='Pan1c input haplotype statistics')
+arg_parser.add_argument(
+    "--fasta",
+    "-f",
+    dest = "fastafile",
+    required = True,
+    help = "Fasta file"
+    )
+arg_parser.add_argument(
+    "--output",
+    "-o",
+    dest = "outdir",
+    required = True,
+    help = "Output directory"
+    )
+arg_parser.add_argument(
+    "--debug",
+    "-d",
+    action="store_true",
+    dest = "debug",
+    help = "Show debug"
+    )
+args = arg_parser.parse_args()
+
+## Toolbox
+def getChr(name):
+    """
+    Simply returns the chromosome name of a given fasta record
+    """
+    return name.split('#')[-1]
+
+## Main script
+seqDict = {}
+
+# Parsing fasta files
+for filename in args.fastafiles:
+    
+    # Reading .fa.gz file and adding records to seqDict
+    with gzip.open(filename, "rt") as handle:
+        sequences = SeqIO.parse(handle, "fasta")
+        
+        for record in sequences:
+            seqDict[record.id] = record
+
+# Inferring the list of available chromosomes from the sequence record ids
+chrList = np.unique([
+    getChr(recordid) for recordid in seqDict.keys()
+])
+
+# Clustering records based on chromosome tag in records id
+chrSeq = {}
+
+for recordid, record in seqDict.items():
+    # Getting the chromosome id
+    _chrname = getChr(recordid)
+
+    # If not encountered before, create a key for this chromosome in chrSeq
+    if _chrname not in list(chrSeq.keys()):
+        chrSeq[_chrname] = []
+
+    # Add the record to the chromosome bin in chrSeq
+    chrSeq[_chrname].append(record)
+
+# Debug purpose print
+if args.debug: print(chrSeq.keys())
+
+# Writing chromosome specific fasta file
+for chrName in chrSeq.keys():
+    with open(os.path.join(args.outdir, f"{chrName}.fa"), "w") as output_handle:
+        SeqIO.write(chrSeq[chrName], output_handle, "fasta")
\ No newline at end of file
-- 
GitLab


From 22223617f092ee4ffc706d7fd5c4ff1f22aaa70e Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 5 Mar 2024 10:47:37 +0100
Subject: [PATCH 06/24] Renamed statsAggregation to chrStatsAggregation

---
 Snakefile                                               | 6 +++---
 scripts/{statsAggregation.py => chrStatsAggregation.py} | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)
 rename scripts/{statsAggregation.py => chrStatsAggregation.py} (96%)

diff --git a/Snakefile b/Snakefile
index 3d7af83..6d0761e 100644
--- a/Snakefile
+++ b/Snakefile
@@ -190,8 +190,8 @@ rule graph_figs:
             shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).1Dviz.png {params.oneDviz} -t {threads} -P")
             shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).pcov.png {params.pcov} -t {threads} -P")
 
-rule aggregate_stats:
-    # Reading and merging all stats files into a final one called aggregatedStats.tsv
+rule aggregate_graphs_stats:
+    # Reading and merging all stats files from chromosome graphs into a .tsv.
     input:
         "output/stats/"
     output:
@@ -201,7 +201,7 @@ rule aggregate_stats:
         panname=config['name']
     shell:
         """
-        apptainer run {params.apppath}/pan1c-env.sif python scripts/statsAggregation.py \
+        apptainer run {params.apppath}/pan1c-env.sif python scripts/chrStatsAggregation.py \
             --input {input} --output {output} --panname {params.panname}
         """
 
diff --git a/scripts/statsAggregation.py b/scripts/chrStatsAggregation.py
similarity index 96%
rename from scripts/statsAggregation.py
rename to scripts/chrStatsAggregation.py
index c399b6d..e3dbc14 100644
--- a/scripts/statsAggregation.py
+++ b/scripts/chrStatsAggregation.py
@@ -1,5 +1,5 @@
 """
-Statistics aggregator for Pan1c workflow
+Chromosomes statistics aggregator for Pan1c workflow
 
 Aggregate chromosome level graph statistics into a single TSV.
 
-- 
GitLab


From aeb8c7e64535d32a63897c2272f90ba4bfb0ad70 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 5 Mar 2024 11:16:13 +0100
Subject: [PATCH 07/24] Fixed chrInputs fasta

Changed the FASTA output from fixed-width wrapped sequences to single-line sequences
---
 scripts/inputClustering.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/scripts/inputClustering.py b/scripts/inputClustering.py
index 8ceed34..b3688bb 100644
--- a/scripts/inputClustering.py
+++ b/scripts/inputClustering.py
@@ -10,6 +10,7 @@ Each output fasta contains sequences related to one chromosome only.
 """
 
 from Bio import SeqIO
+from Bio.SeqIO import FastaIO
 import numpy as np
 import argparse
 import gzip
@@ -86,4 +87,5 @@ if args.debug: print(chrSeq.keys())
 # Writing chromosome specific fasta file
 for chrName in chrSeq.keys():
     with open(os.path.join(args.outdir, f"{chrName}.fa"), "w") as output_handle:
-        SeqIO.write(chrSeq[chrName], output_handle, "fasta")
\ No newline at end of file
+        fasta_out = FastaIO.FastaWriter(output_handle, wrap=None)
+        fasta_out.write_file(chrSeq[chrName])  
\ No newline at end of file
-- 
GitLab


From 3a5de54f38b92a2c495ba8131d0abd9ce0c8d89d Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 5 Mar 2024 14:09:08 +0100
Subject: [PATCH 08/24] Update coreStats.py

---
 scripts/coreStats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/coreStats.py b/scripts/coreStats.py
index 6e28c60..a4c31fc 100644
--- a/scripts/coreStats.py
+++ b/scripts/coreStats.py
@@ -169,7 +169,7 @@ df = pd.DataFrame.from_dict(aggregatedData, orient='index')
 df.reset_index(inplace=True)
 df.rename(columns={'level_0': 'pangenome.name', 'level_1': 'chrom.id'}, inplace=True)
 df.rename(columns=graphColDict, inplace = True)
-df.time = pd.to_timedelta(df.time)
+df["pggb.time"] = pd.to_timedelta(df["pggb.time"])
 df.mem = df.mem.astype(float)
 df.cpu = df.cpu.astype(int)
 df["input.total.length"] = df["input.total.length"].astype(int)
-- 
GitLab


From 72d04a026f9fe7520ee448e04198ed6dc4e107ba Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 5 Mar 2024 14:09:11 +0100
Subject: [PATCH 09/24] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 6d0761e..8d8687d 100644
--- a/Snakefile
+++ b/Snakefile
@@ -174,7 +174,7 @@ rule graph_figs:
     input:
         "data/chrGraphs/graphsList.txt"
     output:
-        directory("output/figures")
+        directory("output/chrGraphs.figs")
     threads: 4
     params:
         apppath=config['app.path'],
-- 
GitLab


From e4fcd1dc7869c54f6a0dd92a9e5ea2e5448eb9d6 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 5 Mar 2024 14:22:32 +0100
Subject: [PATCH 10/24] Update coreStats.py

---
 scripts/coreStats.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/coreStats.py b/scripts/coreStats.py
index a4c31fc..e6b555f 100644
--- a/scripts/coreStats.py
+++ b/scripts/coreStats.py
@@ -170,8 +170,8 @@ df.reset_index(inplace=True)
 df.rename(columns={'level_0': 'pangenome.name', 'level_1': 'chrom.id'}, inplace=True)
 df.rename(columns=graphColDict, inplace = True)
 df["pggb.time"] = pd.to_timedelta(df["pggb.time"])
-df.mem = df.mem.astype(float)
-df.cpu = df.cpu.astype(int)
+df["pggb.mem"] = df["pggb.mem"].astype(float)
+df["pggb.cpu"] = df["pggb.cpu"].astype(int)
 df["input.total.length"] = df["input.total.length"].astype(int)
 
 if args.debug: print("[timeStats::debug]", df.dtypes)
-- 
GitLab


From 87d65fd6cc8037b5e90fcfe78e3f4fc07a18c2c1 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 5 Mar 2024 14:47:57 +0100
Subject: [PATCH 11/24] Update coreStats.py

---
 scripts/coreStats.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/scripts/coreStats.py b/scripts/coreStats.py
index e6b555f..aa1b160 100644
--- a/scripts/coreStats.py
+++ b/scripts/coreStats.py
@@ -190,7 +190,7 @@ plt.savefig(os.path.join(args.figdir,"TimeVSSeqLength.png"))
 plt.close()
 
 # Memory versus base count
-sns.regplot(x=df["input.total.length"], y=["pggb.mem"], line_kws={"color":"r","alpha":0.7,"lw":5})
+sns.regplot(x=df["input.total.length"], y=df["pggb.mem"], line_kws={"color":"r","alpha":0.7,"lw":5})
 equation = getRegEquation(x=df["input.total.length"], y=df["pggb.mem"])
 plt.xlabel('Total input sequences length (#bases)')
 plt.ylabel('Peak memory usage (GB)')
-- 
GitLab


From ec716b85dc35d8ff09d92689d69da7a2da9ff190 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 6 Mar 2024 08:57:26 +0100
Subject: [PATCH 12/24] Update Snakefile

---
 Snakefile | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/Snakefile b/Snakefile
index 8d8687d..818908c 100644
--- a/Snakefile
+++ b/Snakefile
@@ -21,7 +21,7 @@ def which_post_analysis():
     ## Simple function to configure which parts of the workflow needs to be run
     post_analysis_inputs = [     # Default post analysis steps
         "output/pan1c.pggb."+config['name']+".core.stats.tsv",
-        "output/figures",
+        "output/chrGraphs.figs",
         "output/panacus.reports",
         ]
     
-- 
GitLab


From c0ad531ddcad411f44037e4ba130aa453c83291f Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 6 Mar 2024 16:33:43 +0100
Subject: [PATCH 13/24] Added SyriFigs generator script

Added a script to generate SyRI figures from FASTA files
Added a SyRI figure covering all haplotypes (all in one)
---
 Snakefile              | 55 +++++++++++++++++++++++++++++++
 scripts/getSyriFigs.sh | 74 ++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 129 insertions(+)
 create mode 100755 scripts/getSyriFigs.sh

diff --git a/Snakefile b/Snakefile
index 818908c..ecee517 100644
--- a/Snakefile
+++ b/Snakefile
@@ -33,6 +33,7 @@ def which_post_analysis():
 
 rule all:
     input:
+        "output/asm.syri.figs/pan1c."+config['name']+".asm.syri.png",
         "output/pan1c.pggb."+config['name']+".gfa",
         "output/pan1c.pggb."+config['name']+".gfa.metadata",
         "output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv",
@@ -52,6 +53,10 @@ rule samtools_index:
         "apptainer run --app samtools {params.apppath}/PanGeTools.sif "
         "faidx {input.fa}"
 
+"""
+Pre-processing section
+"""
+
 rule ragtag_scaffolding:
     # Scaffold input haplotype against the reference to infer chromosome scale sequences
     input:
@@ -75,6 +80,29 @@ rule ragtag_scaffolding:
             -o {output}
         """
 
+rule create_asm_syri_fig:
+    input:
+        ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
+        qry=expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
+    output:
+        fig="output/asm.syri.figs/pan1c."+config['name']+".asm.syri.png",
+        wrkdir=directory('data/asm.syri')
+    threads: 8
+    params:
+        apppath=config["app.path"]
+    shell:
+        """
+        mkdir {output.wrkdir}
+        bash scripts/getSyriFigs.sh \
+            -a {params.apppath} \
+            -t {threads} \
+            -d {output.wrkdir} \
+            -o $(basename {output.fig}) \
+            -r {input.ref} \
+            -q "{input.qry}"
+        mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
+        """
+
 """
 Core section
 """
@@ -98,6 +126,33 @@ rule clustering:
         done
         """
 
+rule create_pggb_input_syri_fig:
+    input:
+        fasta='data/chrInputs/{chromosome}.fa.gz'
+    output:
+        fig="output/chrInput.syri.figs/{chromosome}."+config['name']+".asm.syri.png",
+        wrkdir=directory('data/chrInput/syri/{chromosome}')
+    threads: 8
+    params:
+        apppath=config["app.path"],
+        ref=config['reference']
+    shell:
+        """
+        mkdir {output.wrkdir}
+        refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1) 
+        zgrep -A1 ">${refname}*" {input.fasta} > {output.wrkdir}/refseq.fa
+        zgrep -v -A1 ">${refname}*" {input.fasta} | grep -v '^--$' > {output.wrkdir}/qryseq.fa
+
+        bash scripts/getSyriFigs.sh \
+            -a {params.apppath} \
+            -t {threads} \
+            -d {output.wrkdir} \
+            -o $(basename {output.fig}) \
+            -r {input.ref} \
+            -q "{input.qry}"
+        mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
+        """
+
 rule pggb_on_chr:
     # Runs pggb on a specific chromosome
     input:
diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh
new file mode 100755
index 0000000..6708c42
--- /dev/null
+++ b/scripts/getSyriFigs.sh
@@ -0,0 +1,74 @@
+#!/bin/bash
+# Create a Syri figure for the given genomes
+# @author: alexis.mergez@inrae.fr
+
+# Initializing arguments
+ref=""          # Reference fasta
+qry=""          # Queries fasta
+appdir=""       # Directory containing apptainer images
+threads=""      
+wrkdir=""       # Working directory (directory used by pggb to store step files like .paf, etc...)
+output=""       # Output Syri figure(s)
+
+## Getting arguments
+while getopts "r:q:a:t:d:o:" option; do
+    case "$option" in
+        r) ref="$OPTARG";;
+        q) qry="$OPTARG";;
+        a) appdir="$OPTARG";;
+        t) threads="$OPTARG";;
+        d) wrkdir="$OPTARG";;
+        o) output="$OPTARG";;
+        \?) echo "Usage: $0 [-r ref] [-q query] [-a appdir] [-t threads] [-d wrkdir] [-o output]" >&2
+            exit 1;;
+    esac
+done
+
+IFS=' ' read -r -a temp <<< "$qry"
+#echo "${qryList[@]}"
+
+asmList=("$ref")
+## Sorting the array to put the reference in first
+for item in "${temp[@]}"; do
+    if [[ $item != "$ref" ]]; then
+        asmList+=($item)
+    fi
+done
+
+#asmList=("${ref}" "${qryList[@]}")
+echo "${asmList[@]}"
+syriFileList=()
+
+for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
+
+    samFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).sam"
+    syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out"
+    syriFileList+=($syriFile)
+
+    ##Â Minimap2 genome vs genome alignment
+    apptainer run --app minimap2 $appdir/PanGeTools.sif \
+        -ax asm5 --eqx ${asmList[i]} ${asmList[i + 1]} -t $threads > \
+        $wrkdir/$samFile
+
+    echo "${asmList[i]}"
+    ## Syri on previous alignment
+    apptainer run $appdir/pan1c-env.sif \
+        syri -c $wrkdir/$samFile -r ${asmList[i]} -q ${asmList[i + 1]} -k -F S \
+        --nc $threads \
+        --dir $wrkdir --prefix "$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz)."    
+done
+
+## Creating genomes.txt
+echo -e "#files\tname" > $wrkdir/genomes.txt
+for asm in "${asmList[@]}"; do
+    echo -e "${asm}\t$(basename $asm .fa.gz | cut -d'.' -f1)" >> $wrkdir/genomes.txt
+done
+
+## Generating the list of syri files for the plotsr command
+command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f 12 -H 16 -W 9 -d 600 "
+for file in "${syriFileList[@]}"; do
+    command+="--sr $wrkdir/$file "
+done
+
+apptainer run $appdir/pan1c-env.sif plotsr \
+    $command
-- 
GitLab


From b50d9ec27802a52bc7312cf4ccf011c9f42fa910 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Wed, 6 Mar 2024 17:24:59 +0100
Subject: [PATCH 14/24] Update Snakefile

Working on Syri figures for pggb inputs
---
 Snakefile | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/Snakefile b/Snakefile
index ecee517..83b413a 100644
--- a/Snakefile
+++ b/Snakefile
@@ -139,18 +139,28 @@ rule create_pggb_input_syri_fig:
     shell:
         """
         mkdir {output.wrkdir}
-        refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1) 
-        zgrep -A1 ">${refname}*" {input.fasta} > {output.wrkdir}/refseq.fa
-        zgrep -v -A1 ">${refname}*" {input.fasta} | grep -v '^--$' > {output.wrkdir}/qryseq.fa
+        refname=$(basename {params.ref} .fa.gz | cut -d'.' -f1)
+
+        ## Creating single fasta from multifasta
+        zcat {input.fasta} | awk -F"#" \
+            '/^>/ {OUT="{output.wrkdir}" substr($0,2) ".fa"}; {print >> OUT; close(OUT)}'
+        
+        ## Getting the list of sequences
+        asmList=()
+        for file in {output.wrkdir}/*.fa; do
+            asm="$(basename $file .fa | cut -f1 -d"#").fa"
+            mv $file "$(dirname $file)/$asm"
+            asmList+=("$asm")
 
         bash scripts/getSyriFigs.sh \
             -a {params.apppath} \
             -t {threads} \
             -d {output.wrkdir} \
             -o $(basename {output.fig}) \
-            -r {input.ref} \
-            -q "{input.qry}"
+            -r {output.wrkdir}/"${refname}.fa" \
+            -q "${asmList[@]}"
         mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
+        rm {output.wrkdir}/*.fa
         """
 
 rule pggb_on_chr:
-- 
GitLab


From b05b3494cf844eccc2612ef3c488c1359d093176 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 7 Mar 2024 16:12:42 +0100
Subject: [PATCH 15/24] Using BAM instead of SAM for SyRI

---
 scripts/getSyriFigs.sh | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh
index 6708c42..dfeef3e 100755
--- a/scripts/getSyriFigs.sh
+++ b/scripts/getSyriFigs.sh
@@ -41,19 +41,19 @@ syriFileList=()
 
 for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
 
-    samFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).sam"
+    bamFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).bam"
     syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out"
     syriFileList+=($syriFile)
 
     ##Â Minimap2 genome vs genome alignment
     apptainer run --app minimap2 $appdir/PanGeTools.sif \
-        -ax asm5 --eqx ${asmList[i]} ${asmList[i + 1]} -t $threads > \
-        $wrkdir/$samFile
+        -ax asm5 --eqx ${asmList[i]} ${asmList[i + 1]} -t $threads | \
+        apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads - > $wrkdir/$bamFile
 
     echo "${asmList[i]}"
     ## Syri on previous alignment
     apptainer run $appdir/pan1c-env.sif \
-        syri -c $wrkdir/$samFile -r ${asmList[i]} -q ${asmList[i + 1]} -k -F S \
+        syri -c $wrkdir/$bamFile -r ${asmList[i]} -q ${asmList[i + 1]} -k -F B \
         --nc $threads \
         --dir $wrkdir --prefix "$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz)."    
 done
-- 
GitLab


From 6c38e42a2994dc7a568e34e15f62558699123199 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 7 Mar 2024 16:13:09 +0100
Subject: [PATCH 16/24] Removed unused package in getTags.py

---
 scripts/getTags.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/scripts/getTags.py b/scripts/getTags.py
index 013f504..b01c5e3 100644
--- a/scripts/getTags.py
+++ b/scripts/getTags.py
@@ -10,7 +10,6 @@ Returns a list of tags to ad at the top of the final gfa file as commented lines
 import argparse
 import subprocess
 import os
-import json
 
 ## Arguments
 arg_parser = argparse.ArgumentParser(description='Tag list creation script')
-- 
GitLab


From ad35dec1480dd4805bcbc6387cdb645ea9986bd5 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Thu, 7 Mar 2024 16:14:41 +0100
Subject: [PATCH 17/24] Update Snakefile

Moving away from graphList
Added SyRI for single assembly comparison
---
 Snakefile | 69 ++++++++++++++++++++++++++++++++++++++++---------------
 1 file changed, 51 insertions(+), 18 deletions(-)

diff --git a/Snakefile b/Snakefile
index 83b413a..7a710c4 100644
--- a/Snakefile
+++ b/Snakefile
@@ -11,6 +11,11 @@ SAMPLES = np.unique([
     for f in os.listdir("data/haplotypes/")
     if re.search(r".fa", f)
     ])
+SAMPLES_NOREF = [
+    sample 
+    for sample in SAMPLES 
+    if sample != os.path.basename(config['reference']).split('.fa')[0]
+    ]
 nHAP = len(SAMPLES)
 
 # Getting the list of chromosomes
@@ -22,7 +27,7 @@ def which_post_analysis():
     post_analysis_inputs = [     # Default post analysis steps
         "output/pan1c.pggb."+config['name']+".core.stats.tsv",
         "output/chrGraphs.figs",
-        "output/panacus.reports",
+        expand("output/panacus.reports/{chromosome}.histgrowth.html", chromosome=CHRLIST)
         ]
     
     # Optionals post analysis steps
@@ -33,7 +38,8 @@ def which_post_analysis():
 
 rule all:
     input:
-        "output/asm.syri.figs/pan1c."+config['name']+".asm.syri.png",
+        "output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png",
+        expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF),
         "output/pan1c.pggb."+config['name']+".gfa",
         "output/pan1c.pggb."+config['name']+".gfa.metadata",
         "output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv",
@@ -80,13 +86,14 @@ rule ragtag_scaffolding:
             -o {output}
         """
 
-rule create_asm_syri_fig:
+rule create_allAsm_syri_fig:
+    # Create a SyRI figure containing all input assemblies
     input:
         ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
         qry=expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
     output:
-        fig="output/asm.syri.figs/pan1c."+config['name']+".asm.syri.png",
-        wrkdir=directory('data/asm.syri')
+        fig="output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png",
+        wrkdir=directory('data/asm.syri/all')
     threads: 8
     params:
         apppath=config["app.path"]
@@ -103,6 +110,30 @@ rule create_asm_syri_fig:
         mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
         """
 
+rule create_sglAsm_syri_fig:
+    # Create a SyRI figure for a single input assembly
+    input:
+        ref="data/hap.ragtagged/"+config['reference'][:-5]+"ragtagged.fa.gz",
+        qry="data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
+    output:
+        fig="output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png",
+        wrkdir=directory('data/asm.syri/{haplotype}')
+    threads: 4
+    params:
+        apppath=config["app.path"]
+    shell:
+        """
+        mkdir -p {output.wrkdir}
+        bash scripts/getSyriFigs.sh \
+            -a {params.apppath} \
+            -t {threads} \
+            -d {output.wrkdir} \
+            -o $(basename {output.fig}) \
+            -r {input.ref} \
+            -q "{input.qry}"
+        mv {output.wrkdir}/$(basename {output.fig}) {output.fig}
+        """
+
 """
 Core section
 """
@@ -281,9 +312,9 @@ rule gfaTagR:
         panname=config['name']
     shell:
         """
-        apptainer run {params.apppath}/pan1c-env.sif python scripts/getTags.py \
+        python scripts/getTags.py \
             --appdir {params.apppath} --config-file config.yaml > {output}
-        sed -i '/^H*/r {output}' {input.graph}
+        sed -i '/^H/r {output}' {input.graph}
         """
 
 rule pggb_log_compression:
@@ -358,21 +389,23 @@ rule get_pav:
             shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.apppath} -t {threads}")
 
 rule panacus_stats:
-    # Produces panacus reports for each chromosome graphs
+    # Produces panacus reports for a chromosome graph
     input:
-        "data/chrGraphs/graphsList.txt"
+        graph="data/chrGraphs/{chromosome}.gfa"
     output:
-        directory("output/panacus.reports")
+        html="output/panacus.reports/{chromosome}.histgrowth.html"
     params:
         apppath=config['app.path'],
         panname=config['name'],
         refname=config['reference']
     threads: 8
-    run:
-        shell("mkdir {output}")
-        # Getting the list of graphs
-        with open(input[0]) as handle:
-            graphList = [graph.rstrip("\n") for graph in handle.readlines()]
-        # Iterating over graphs
-        for graph in graphList:
-            shell("bash scripts/getPanacusHG.sh -g {graph} -r $(basename {params.refname} .fa.gz) -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).histgrowth.html -a {params.apppath} -t {threads}")
+    shell:
+        """
+        bash scripts/getPanacusHG.sh \
+            -g {input.graph} \
+            -r $(basename {params.refname} .fa.gz) \
+            -d data/chrGraphs/$(basename {input.graph} .gfa) \
+            -o {output.html} \
+            -a {params.apppath} \
+            -t {threads}
+        """
-- 
GitLab


From 07f92057d537b0b5bae7a750913e605407bffc62 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Fri, 8 Mar 2024 15:57:50 +0100
Subject: [PATCH 18/24] Fixed missing import

---
 scripts/getTags.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/scripts/getTags.py b/scripts/getTags.py
index b01c5e3..013f504 100644
--- a/scripts/getTags.py
+++ b/scripts/getTags.py
@@ -10,6 +10,7 @@ Returns a list of tags to ad at the top of the final gfa file as commented lines
 import argparse
 import subprocess
 import os
+import json
 
 ## Arguments
 arg_parser = argparse.ArgumentParser(description='Tag list creation script')
-- 
GitLab


From 1a98f77c425addfb6aa5a17cfdd1d5d1ac03ace4 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 11 Mar 2024 12:23:25 +0100
Subject: [PATCH 19/24] Moving out of graphList requirement

---
 Snakefile | 47 +++++++++++++++++++++++------------------------
 1 file changed, 23 insertions(+), 24 deletions(-)

diff --git a/Snakefile b/Snakefile
index 7a710c4..9ed967b 100644
--- a/Snakefile
+++ b/Snakefile
@@ -26,7 +26,6 @@ def which_post_analysis():
     ## Simple function to configure which parts of the workflow needs to be run
     post_analysis_inputs = [     # Default post analysis steps
         "output/pan1c.pggb."+config['name']+".core.stats.tsv",
-        "output/chrGraphs.figs",
         expand("output/panacus.reports/{chromosome}.histgrowth.html", chromosome=CHRLIST)
         ]
     
@@ -40,6 +39,7 @@ rule all:
     input:
         "output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png",
         expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF),
+        expand("output/chrGraphs.figs/{chromosome}.1Dviz.png", chromosome=CHRLIST),
         "output/pan1c.pggb."+config['name']+".gfa",
         "output/pan1c.pggb."+config['name']+".gfa.metadata",
         "output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv",
@@ -250,46 +250,45 @@ rule graph_squeeze:
 rule graph_stats:
     # Using odgi to produce stats on every chromosome scale graph
     input:
-        "data/chrGraphs/graphsList.txt"
+        graph="data/chrGraphs/{chromosome}.gfa"
     output:
-        directory("output/stats")
+        "output/chrGraphs.stats/{chromosome}.stats.tsv"
     threads: 4
     params:
         apppath=config['app.path']
-    run:
-        shell("mkdir {output}")
-        # Getting the list of graphs
-        with open(input[0]) as handle:
-            graphList = [graph.rstrip("\n") for graph in handle.readlines()]
-        # Iterating over graphs
-        for graph in graphList:
-            shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif stats -S -t {threads} -P -i {graph} > {output}/$(basename {graph} .gfa).stats.tsv")
+    shell:
+        """
+        apptainer run --app odgi {params.apppath}/PanGeTools.sif stats \
+            -S -t {threads} -P -i {input.graph} > {output}
+        """
 
 rule graph_figs:
     # Creating figures using odgi viz 
     input:
-        "data/chrGraphs/graphsList.txt"
+        graph="data/chrGraphs/{chromosome}.gfa"
     output:
-        directory("output/chrGraphs.figs")
+        oneDviz="output/chrGraphs.figs/{chromosome}.1Dviz.png",
+        pcov="output/chrGraphs.figs/{chromosome}.pcov.png"
     threads: 4
     params:
         apppath=config['app.path'],
         oneDviz=config['odgi.1Dviz.params'],
         pcov=config['odgi.pcov.params']
-    run:
-        shell("mkdir {output}")
-        # Getting the list of graphs
-        with open(input[0]) as handle:
-            graphList = [graph.rstrip("\n") for graph in handle.readlines()]
-        # Iterating over graphs
-        for graph in graphList:
-            shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).1Dviz.png {params.oneDviz} -t {threads} -P")
-            shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).pcov.png {params.pcov} -t {threads} -P")
+    shell:
+        """
+        ## 1D viz
+        apptainer run --app odgi {params.apppath}/PanGeTools.sif \
+            viz -i {input.graph} -o {output.oneDviz} {params.oneDviz} -t {threads} -P
+
+        ## Pcov viz
+        apptainer run --app odgi {params.apppath}/PanGeTools.sif \
+            viz -i {input.graph} -o {output.pcov} {params.pcov} -t {threads} -P
+        """
 
 rule aggregate_graphs_stats:
     # Reading and merging all stats files from chromosome graphs into a .tsv.
     input:
-        "output/stats/"
+        expand("output/chrGraphs.stats/{chromosome}.stats.tsv", chromosome=CHRLIST)
     output:
         "output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv"
     params:
@@ -298,7 +297,7 @@ rule aggregate_graphs_stats:
     shell:
         """
         apptainer run {params.apppath}/pan1c-env.sif python scripts/chrStatsAggregation.py \
-            --input {input} --output {output} --panname {params.panname}
+            --input $(dirname {input[0]}) --output {output} --panname {params.panname}
         """
 
 rule gfaTagR:
-- 
GitLab


From bba873d55564ccce5ded956c6d04ef62942cae60 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 11 Mar 2024 12:29:43 +0100
Subject: [PATCH 20/24] Update Snakefile

---
 Snakefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/Snakefile b/Snakefile
index 9ed967b..9521960 100644
--- a/Snakefile
+++ b/Snakefile
@@ -252,14 +252,14 @@ rule graph_stats:
     input:
         graph="data/chrGraphs/{chromosome}.gfa"
     output:
-        "output/chrGraphs.stats/{chromosome}.stats.tsv"
+        stats="output/chrGraphs.stats/{chromosome}.stats.tsv"
     threads: 4
     params:
         apppath=config['app.path']
     shell:
         """
         apptainer run --app odgi {params.apppath}/PanGeTools.sif stats \
-            -S -t {threads} -P -i {input.graph} > {output}
+            -S -t {threads} -P -i {input.graph} > {output.stats}
         """
 
 rule graph_figs:
-- 
GitLab


From 0015d8a94bcae7bcc74fa1eaff992d249046e613 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 11 Mar 2024 18:10:04 +0100
Subject: [PATCH 21/24] Update Snakefile

---
 Snakefile | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Snakefile b/Snakefile
index 9521960..fa9abf2 100644
--- a/Snakefile
+++ b/Snakefile
@@ -182,6 +182,7 @@ rule create_pggb_input_syri_fig:
             asm="$(basename $file .fa | cut -f1 -d"#").fa"
             mv $file "$(dirname $file)/$asm"
             asmList+=("$asm")
+        done
 
         bash scripts/getSyriFigs.sh \
             -a {params.apppath} \
-- 
GitLab


From 9b3af537596ec4bddaa23c10fb3e737d2e696a76 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 11 Mar 2024 18:25:43 +0100
Subject: [PATCH 22/24] Updated comments

---
 Snakefile   | 41 ++++++++++++++++++++++-------------------
 config.yaml |  5 +++++
 2 files changed, 27 insertions(+), 19 deletions(-)

diff --git a/Snakefile b/Snakefile
index fa9abf2..4c95d73 100644
--- a/Snakefile
+++ b/Snakefile
@@ -5,7 +5,7 @@ import numpy as np
 import gzip
 import re
 
-# Getting the list of haplotypes
+# Getting the list of haplotypes (SAMPLES)
 SAMPLES = np.unique([
     os.path.basename(f).split('.fa')[0] 
     for f in os.listdir("data/haplotypes/")
@@ -22,31 +22,33 @@ nHAP = len(SAMPLES)
 with gzip.open("data/haplotypes/"+config['reference'], "r") as handle:
     CHRLIST = [line.decode().split("#")[-1].split('\n')[0] for line in handle.readlines() if line.decode()[0] == ">"]
 
-def which_post_analysis():
-    ## Simple function to configure which parts of the workflow needs to be run
-    post_analysis_inputs = [     # Default post analysis steps
-        "output/pan1c.pggb."+config['name']+".core.stats.tsv",
-        expand("output/panacus.reports/{chromosome}.histgrowth.html", chromosome=CHRLIST)
+# Configuring steps to be done
+def which_analysis():
+    # Creating a list with default analysis steps (to prevent the function from returning an empty list)
+    analysis_inputs = [     
+        "output/pan1c.pggb."+config['name']+".core.stats.tsv", # core stats
+        expand("output/panacus.reports/{chromosome}.histgrowth.html", chromosome=CHRLIST), # panacus histgrowth
+        expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF), # syri for haplotypes 
+        expand("output/chrGraphs.figs/{chromosome}.1Dviz.png", chromosome=CHRLIST), # visualizations from odgi on chromosome graphs
+        "output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv" # chromosomes graph statistics
         ]
     
-    # Optionals post analysis steps
+    # Optionals analysis steps
     if config["get_PAV"] == "True":
-        post_analysis_inputs.append("output/pav.matrices")
+        analysis_inputs.append("output/pav.matrices")
+    if config["get_allASM_SyRI"] == "True":
+        analysis_inputs.append("output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png")
 
-    return post_analysis_inputs
+    return analysis_inputs
 
 rule all:
     input:
-        "output/asm.syri.figs/pan1c."+config['name']+".allAsm.syri.png",
-        expand("output/asm.syri.figs/pan1c."+config['name']+".{haplotype}.syri.png", haplotype=SAMPLES_NOREF),
-        expand("output/chrGraphs.figs/{chromosome}.1Dviz.png", chromosome=CHRLIST),
-        "output/pan1c.pggb."+config['name']+".gfa",
-        "output/pan1c.pggb."+config['name']+".gfa.metadata",
-        "output/pan1c.pggb."+config['name']+".chrGraph.stats.tsv",
-        which_post_analysis()
+        "output/pan1c.pggb."+config['name']+".gfa", # Final graph (main output)
+        "output/pan1c.pggb."+config['name']+".gfa.metadata", # Metadata for the final graph (also at the top of the gfa file as # lines)
+        which_analysis()
 
 rule samtools_index:
-    # Using samtools faidx to index compressed fasta
+    # Samtools faidx to index compressed fasta
     input:
         fa="{sample}.fa.gz"
     output:
@@ -61,6 +63,7 @@ rule samtools_index:
 
 """
 Pre-processing section
+Preparing pggb inputs
 """
 
 rule ragtag_scaffolding:
@@ -139,7 +142,7 @@ Core section
 """
 
 rule clustering:
-    # Read alignment file to create bins for each chromosome
+    # Read ragtagged fastas and split chromosome sequences into their corresponding bins
     input:
         expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
     output:
@@ -196,7 +199,7 @@ rule create_pggb_input_syri_fig:
         """
 
 rule pggb_on_chr:
-    # Runs pggb on a specific chromosome
+    # Run pggb on a specific chromosome
     input:
         fa="data/chrInputs/{chromosome}.fa.gz",
         gzi="data/chrInputs/{chromosome}.fa.gz.gzi"
diff --git a/config.yaml b/config.yaml
index 7c8337f..41b8b05 100644
--- a/config.yaml
+++ b/config.yaml
@@ -4,4 +4,9 @@ app.path: '/home/amergez/work/apptainer'
 pggb.params: '-X --skip-viz'
 odgi.1Dviz.params: '-x 500 -b'
 odgi.pcov.params: '-x 500 -O'
+## Control over optional parts of the workflow
+# get_PAV is very long the more haplotypes there are (all vs all comparison)
 get_PAV: 'False'
+# get_allASM_SyRI controls whether the SyRI figure displaying all haplotypes is created (runtime grows with the number of haplotypes)
+get_allASM_SyRI: 'True'
+
-- 
GitLab


From 301e5ab19dda8f1a6a4f111382f1071f7a7fa388 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <alexis.mergez@inrae.fr>
Date: Mon, 11 Mar 2024 18:27:37 +0100
Subject: [PATCH 23/24] Update config_CICD.yaml

---
 example/config_CICD.yaml | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/example/config_CICD.yaml b/example/config_CICD.yaml
index aeebbf6..8752f50 100644
--- a/example/config_CICD.yaml
+++ b/example/config_CICD.yaml
@@ -4,4 +4,8 @@ app.path: 'appimgs/'
 pggb.params: '-X --skip-viz'
 odgi.1Dviz.params: '-x 500 -b'
 odgi.pcov.params: '-x 500 -O'
+## Control over optional parts of the workflow
+# get_PAV is very long the more haplotypes there are (all vs all comparison)
 get_PAV: 'False'
+# get_allASM_SyRI controls whether the SyRI figure displaying all haplotypes is created (runtime grows with the number of haplotypes)
+get_allASM_SyRI: 'True'
-- 
GitLab


From bcbd57053c88e95e56476809677e17931d191983 Mon Sep 17 00:00:00 2001
From: Alexis Mergez <amergez@miat-pret-5.toulouse.inrae.fr>
Date: Tue, 12 Mar 2024 17:39:52 +0100
Subject: [PATCH 24/24] Added comments

---
 scripts/chrInputStats.py | 45 +++++++++++++++++++++++++++++++++-------
 scripts/getSyriFigs.sh   | 25 ++++++++++++++--------
 scripts/getTags.py       | 31 ++++++++++++++++++++++-----
 3 files changed, 79 insertions(+), 22 deletions(-)

diff --git a/scripts/chrInputStats.py b/scripts/chrInputStats.py
index 9d23c75..7abe380 100644
--- a/scripts/chrInputStats.py
+++ b/scripts/chrInputStats.py
@@ -1,7 +1,7 @@
 """
 Input statistics script for Pan1c workflow
 
-Given a fasta input made for pggb, computes stats regarding the complexity of the sequences
+Given a fasta input made for pggb (data/chrInputs), it computes statistics. 
 
 @author: alexis.mergez@inrae.fr
 @version: 1.0 
@@ -47,48 +47,73 @@ arg_parser.add_argument(
     )
 args = arg_parser.parse_args()
 
-## Toolbox
-
 ## Main script
+"""
+Sequence dictionary :
+    - key : (Chromosome name (from filename), sequence id)
+    - value : sequence 
+"""
 seqDict = {}
 
 # Parsing fasta files
 for filename in args.fastafiles:
     
+    # Getting chromosome name from fasta filename
     chrName = os.path.basename(filename).split(".fa.gz")[0]
 
-    # Reading .fa.gz file and adding records to seqDict
+    # Reading bgzip fasta file and adding records to seqDict
     with gzip.open(filename, "rt") as handle:
+
+        # Parsing fasta sequences using SeqIO
         sequences = SeqIO.parse(handle, "fasta")
         
+        # Adding each sequence to the sequence dictionary
         for record in sequences:
             seqDict[(chrName, record.id)] = record.seq
 
+# Reading available chromosomes from seqDict keys
 chromList = np.unique([x for x, y in seqDict.keys()])
 
+"""
+Data dictionary :
+    - key : (pangenome name, chromosome id)
+    - value : statistics dictionary
+"""
 aggregatedStats = {}
 
+# Iterating over available chromosomes
 for chrom in chromList:
+    """
+    Statistics dictionary (temporary):
+        - key : statistic name
+        - value : statistic value
+    """
     _stats = {
         "input.#N": 0,
         "input.total.length": 0
         }
+    
+    # Storing the length and computed N percentage of each sequence in dedicated lists
     _lengths, _Nper = [], []
+
+    # Iterating over sequences for the chrom fasta file in the sequence dictionary
     for (chrName, seqid), seq in seqDict.items():
+
         if chrName == chrom:
-            # Counting number of Ns
+            # Counting the number of Ns
             _stats["input.#N"] += seq.count("N")
-            # Counting total bases
+            # Counting the total number of bases
             _stats["input.total.length"] += len(seq)
-            # Saving length and N percentage
+            # Saving the length and the N percentage
             _lengths.append(len(seq))
             _Nper.append((seq.count("N")/len(seq)))
 
+    # Adding stats
     _stats["input.mean.N%"] = np.mean(_Nper)*100
     _stats["input.mean.length"] = np.mean(_lengths)
     _stats["input.#sequences"] = len(_lengths)
 
-    #Computing L50
+    # Computing L50 over the sequences of chrom
     _lengths = sorted(_lengths, reverse = True)
     halfTotal = _stats["input.total.length"]/2
     cumulativeLength, L50 = 0, 0
@@ -101,13 +126,17 @@ for chrom in chromList:
     
     _stats["input.L50"] = L50
 
+    # Adding stats to the corresponding chromosome in the data dictionary
     aggregatedStats[(args.panname,chrom)] = _stats
 
 if args.debug: print(aggregatedStats)    
 
+# Converting data dictionary to pandas dataframe
 df = pd.DataFrame.from_dict(aggregatedStats, orient='index')
 df.reset_index(inplace=True)
 df.rename(columns={df.columns[0]: 'pangenome.name', df.columns[1]: 'chrom.id'}, inplace = True)
+
+# Saving to TSV
 df.to_csv(
     args.output,
     sep='\t',
diff --git a/scripts/getSyriFigs.sh b/scripts/getSyriFigs.sh
index dfeef3e..761955c 100755
--- a/scripts/getSyriFigs.sh
+++ b/scripts/getSyriFigs.sh
@@ -24,51 +24,58 @@ while getopts "r:q:a:t:d:o:" option; do
     esac
 done
 
+## Main script
+# Reading query argument and creating an array containing the path to query fasta files
 IFS=' ' read -r -a temp <<< "$qry"
-#echo "${qryList[@]}"
 
 asmList=("$ref")
-## Sorting the array to put the reference in first
+# Building the array with the reference first, followed by the queries
 for item in "${temp[@]}"; do
     if [[ $item != "$ref" ]]; then
         asmList+=($item)
     fi
 done
 
-#asmList=("${ref}" "${qryList[@]}")
-echo "${asmList[@]}"
+# Array to store the created syri files 
 syriFileList=()
 
+# Iterating 2 by 2 with overlap, over the array of fasta files
 for ((i = 0; i < ${#asmList[@]} - 1; i++)); do
 
+    # Setting filepaths for later
     bamFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).bam"
     syriFile="$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz).syri.out"
+    
+    # Adding the output syri file to the array
     syriFileList+=($syriFile)
 
-    ##Â Minimap2 genome vs genome alignment
+    # Minimap2 genome vs genome alignment
     apptainer run --app minimap2 $appdir/PanGeTools.sif \
         -ax asm5 --eqx ${asmList[i]} ${asmList[i + 1]} -t $threads | \
         apptainer run --app samtools $appdir/PanGeTools.sif sort -O BAM -@ $threads - > $wrkdir/$bamFile
 
-    echo "${asmList[i]}"
-    ## Syri on previous alignment
+    # Syri on previous alignment
     apptainer run $appdir/pan1c-env.sif \
         syri -c $wrkdir/$bamFile -r ${asmList[i]} -q ${asmList[i + 1]} -k -F B \
         --nc $threads \
         --dir $wrkdir --prefix "$(basename ${asmList[i]} .fa.gz)_$(basename ${asmList[i + 1]} .fa.gz)."    
 done
 
-## Creating genomes.txt
+# Creating genomes.txt for plotsr. It is used to give simple names to fasta files in the final figure
+# Each line contains 2 columns : fasta filepath and its simpler name
 echo -e "#files\tname" > $wrkdir/genomes.txt
 for asm in "${asmList[@]}"; do
     echo -e "${asm}\t$(basename $asm .fa.gz | cut -d'.' -f1)" >> $wrkdir/genomes.txt
 done
 
-## Generating the list of syri files for the plotsr command
+# Generating the plotsr command
 command="--genomes $wrkdir/genomes.txt -o $wrkdir/$output -f 12 -H 16 -W 9 -d 600 "
+
+# Adding syri files to the command as each needs to be specified using "--sr" argument 
 for file in "${syriFileList[@]}"; do
     command+="--sr $wrkdir/$file "
 done
 
+# Running plotsr
 apptainer run $appdir/pan1c-env.sif plotsr \
     $command
diff --git a/scripts/getTags.py b/scripts/getTags.py
index 013f504..778c7ef 100644
--- a/scripts/getTags.py
+++ b/scripts/getTags.py
@@ -30,24 +30,38 @@ arg_parser.add_argument(
     )
 args = arg_parser.parse_args()
 
+## Main script
+"""
+Tags dictionary :
+    - key : Main tool / apptainer image
+    - value : dictionary of tags
+"""
 tags = {}
 
-## Pan1c-workflow section
+### Pan1c-workflow section
 tags["Pan1c"] = {}
+
+# Using git to get the version of the Pan1c workflow 
 _output = subprocess.run(
     ["git", "describe", "--tags"],
     capture_output=True,
     text=True,
 ).stdout[:-1]
+
+# Getting the pggb parameters from the "pggb.params" line of the config file
+# TODO: get the command actually used from the pggb command logs
 with open(args.config, 'r') as handle:
     pggbCmd = [line[:-1] for line in handle.readlines() if "pggb.params" in line][0].split(': ')[-1]
 
+# Adding tags
 tags["Pan1c"]["pan1c.version"] = _output
 tags["Pan1c"]["pan1c.home"] = "https://forgemia.inra.fr/alexis.mergez/pan1c"
 tags["Pan1c"]["pan1c.pggb.args"] = pggbCmd
 
-## PanGeTools section
+### PanGeTools section
 tags["pangetools"] = {}
+
+# Reading the app versions from the apptainer image labels
 _output = subprocess.run(
     ["apptainer", "inspect", "-j", f"{args.appdir}/PanGeTools.sif"],
     capture_output=True, 
@@ -58,13 +72,15 @@ labels = _output['data']['attributes']['labels']
 tags["pangetools"]["image.version"] = labels['Version']
 tags["pangetools"]["image.home"] = labels['about.home']
 
+# Adding app versions to the tag dictionary
 for key in labels.keys():
     if ".Version" in key:
         tags["pangetools"][key.lower()] = labels[key]
 
-## PGGB image section
+### PGGB image section
 tags["pggb"] = {}
 
+# Reading the app versions from the apptainer image labels
 _output = subprocess.run(
     ["apptainer", "inspect", "-j", f"{args.appdir}/pggb.sif"],
     capture_output=True, 
@@ -75,13 +91,15 @@ labels = _output['data']['attributes']['labels']
 tags["pggb"]["image.version"] = labels['Version']
 tags["pggb"]["image.home"] = labels['about.home']
 
+# Adding app versions to the tag dictionary
 for key in labels.keys():
     if ".Version" in key:
         tags["pggb"][key.lower()] = labels[key]
 
-## Pan1c-Env section
+### Pan1c-Env section
 tags["pan1c-env"] = {}
 
+# Reading the app versions from the apptainer image labels
 _output = subprocess.run(
     ["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-env.sif"],
     capture_output=True, 
@@ -92,6 +110,7 @@ labels = _output['data']['attributes']['labels']
 tags["pan1c-env"]["image.version"] = labels['Version']
 tags["pan1c-env"]["image.home"] = labels['about.home']
 
+# Adding app versions to the tag dictionary
 for key in labels.keys():
     if ".Version" in key:
         tags["pan1c-env"][key.lower()] = labels[key]
@@ -99,6 +118,7 @@ for key in labels.keys():
 ## Pan1c-Box section
 tags["pan1c-box"] = {}
 
+# Reading the app versions from the apptainer image labels
 _output = subprocess.run(
     ["apptainer", "inspect", "-j", f"{args.appdir}/pan1c-box.sif"],
     capture_output=True, 
@@ -109,11 +129,12 @@ labels = _output['data']['attributes']['labels']
 tags["pan1c-box"]["image.version"] = labels['Version']
 tags["pan1c-box"]["image.home"] = labels['about.home']
 
+# Adding app versions to the tag dictionary
 for key in labels.keys():
     if ".Version" in key:
         tags["pan1c-box"][key.lower()] = labels[key]
 
-## Exporting tags
+## Exporting tags to stdout
 print("#\tThis graph have been created using the Pan1c workflow (https://forgemia.inra.fr/alexis.mergez/pan1c)\n#")
 print("#\tTool versions and commands\n#")
 for first_elem in tags.keys():
-- 
GitLab