Merge branch '8-includeIsbpsMapping' into 'master'

NEW : ISBPs extraction and mapping added in the begining of the pipeline See merge request !10

Merge branch '8-includeIsbpsMapping' into 'master'
NEW : ISBPs extraction and mapping added in the begining of the pipeline See merge request !10
76bf22e3 · Helene Rimbert · 70c7a358 · 0fa68d60 · 76bf22e3 · 76bf22e3
Commit 76bf22e3 authored 1 year ago by Helene Rimbert
--- a/README.md
+++ b/README.md
@@ -40,6 +40,7 @@ All the dependancies installed in the conda env are listed below.
 * GMAP: 2018-05-11
 * NCBI-blast (BLAST+): 2.6
 * Samtools: 1.9
+* BWA: 0.7

 ## Prepare and run the pipeline

@@ -61,15 +62,8 @@ The configuration file [config.yaml](config.yaml) will contain all the input fil

 The pipeline uses markers/ISBPs as anchores to target a restricted part of the target genome on which our genes are suspected to be located.

-In the first place, it is required to map such markers (ISBPs in our cases) with bwa on your target genome:
-
-```bash
-$ bwa mem IWGSC_RefSeqV2_bwaindex ISBPs.fasta |samtools view -bS - |samtools sort -o ISBPS_on_refseqv2.bam
-```
-
 | Parameter in `config.yaml` | format |description |Example |  
 |----------------------------|--------|------------|--------|  
-|**isbpBam**| BAM|BAM file of the mapping of markers/ISBPs on the target genome|isbpBam: 'ISBPS_on_refseqv2.bam'|
 |**isbpBed**| BED|Initial coordinates of the markers/ISBPs on the query genome|isbpBed: 'data/ISBP_refseqv1.bed'|
 |**mapq**| \[INT\]|Minimum mapping quality of the marker/ISBP to be kept in the anchoring process|mapq: 30|
 |**mismatches**| \[INT\]|Max missmatches allowed for a marker to be kept in the analysis|mismatches: 0|
@@ -80,9 +74,10 @@ Here, we only need the fasta of the new genome assembly

 | Parameter in `config.yaml` | format |description |Example |  
 |----------------------------|--------|------------|--------|  
-|**targetFasta**|FASTA|fasta file of the target genome assembly on which we transfert the annotation|targetFasta: 'data/CS_pesudo_v2.1.fa'|
-|**targetGmapIndex**|PATH|Name of the GMAP index directory. This will be used with the `-d` option of `gmapl`|targetGmapIndex: 'ensembl_Triticum_aestivum_julius_2022-9-16'|
-|**targetGmapIndexPath**|PATH|Full path to the directory in which the GMAPindex is found. This will be used with the `-D` option of `gmapl`|targetGmapIndexPath: '/home/data/triticum_aestivum/julius/gmapdb/all/'|
+|**targetFasta**|FASTA|fasta file of the target genome assembly on which we transfert the annotation|targetFasta: 'data/CS_pesudo_v2.1.fa'|  
+|**targetGmapIndex**|PATH|Name of the GMAP index directory. This will be used with the `-d` option of `gmapl`|targetGmapIndex: 'ensembl_Triticum_aestivum_julius_2022-9-16'|  
+|**targetBwaIdx**|BWA index|Prefix for the BWA index files|targetBwaIdx: '/home/herimbert/gdec/shared/triticum_aestivum/arinalrfor/current/bwa/all'|  
+|**targetGmapIndexPath**|PATH|Full path to the directory in which the GMAPindex is found. This will be used with the `-D` option of `gmapl`|targetGmapIndexPath: '/home/data/triticum_aestivum/julius/gmapdb/all/'|  

 * Output parameters/settings


--- a/cluster-hpc2.json
+++ b/cluster-hpc2.json
@@ -9,6 +9,12 @@
 		"error" : "slurm-%x-%J.log",
 		"output" : "slurm-%x-%J.log"
 	},
+	"isbpBwa":
+	{	
+		"time" : "1-00:00:00",
+		"c" :"{threads}",
+		"mem" : "32G"
+	},
 	"mapHomologousRegions" :
 	{
 		"time" : "1-00:00:00",

--- a/config.yaml
+++ b/config.yaml
@@ -22,12 +22,18 @@ targetFasta: 'data/Triticum_aestivum_arinalrfor.PGSBv2.1.dna.toplevel.fa'
 targetGmapIndex: 'ensembl_Triticum_aestivum_arinalrfor_2023-2-17'
 #GMAP index: path to the gmapindex directory, for -D option
 targetGmapIndexPath: '/home/herimbert/gdec/shared/triticum_aestivum/arinalrfor/current/gmapdb/all/'
+#BWA index prefix
+targetBwaIdx: '/home/herimbert/gdec/shared/triticum_aestivum/arinalrfor/current/bwa/all'

 ##### ISBP/markers related config and parameters
 # BAM file of markers/ISBPs mapped on the target genome
-isbpBam: '/home/masirvent/wheat10plus-pangenome/data/mappingISBP/session2/arina/arina_CS_ISBP.bam'
+# isbpBam: '/home/masirvent/wheat10plus-pangenome/data/mappingISBP/session2/arina/arina_CS_ISBP.bam'
 # BED file of coordinates on the query genome (REFSEQ v2.1)
 isbpBed: 'data/Tae.Chinese_Spring.refSeqv2.1.ISBPs.bed'
+# BWA threads for mapping
+bwaThreads: 16
+# FLAG : F flag for samtools
+flag_F: 3844
 # minimum mapping quality of markers on the target genome
 mapq: 30
 # max mismatches per ISBP/marker
@@ -38,7 +44,7 @@ results: 'results'
 finalPrefix: 'arina_magatt'
 # this file contains two columns: the first is the chromosome name as it appears in the genome.fasta of the new reference,
 # and the second the chromosome name as it will appear in the new gene Names
-chromMapID: '/home/masirvent/wheat10plus-pangenome/data/liftoff/arina/chromstab.txt'
+chromMapID: 'data/chromstab.txt'

 ##### Nomenclature for final gene IDs
 # used in rule renameGeneIds (rules/geneAnchoring.smk)

--- a/envs/environment.yml
+++ b/envs/environment.yml
@@ -16,3 +16,4 @@ dependencies:
 - gffread
 - gmap=2020.06.30-0
 - blast
+ - bwa
--- a/rules/bedtoolsClosest.smk
+++ b/rules/bedtoolsClosest.smk
 rule selectMappedISBP:
 	message: " Select only mapped ISBPS on new refseq"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: mapped=config['results']+'/1.filteredISBPs/{chrom}/sorted.bed',
 		 original=config['isbpBed']
 	output: config['results']+'/2.mappedISBPs/{chrom}/coordsOnQuery.bed'
@@ -13,7 +13,7 @@ rule selectMappedISBP:

 rule keepMappedOnSameChrom:
 	message: " Select Mapped ISBPs on same chromosome (or on unknown chromosome)"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: isbpOnQuery=config['results']+'/2.mappedISBPs/{chrom}/coordsOnQuery.bed', 
 		isbpOnTarget=config['results']+'/1.filteredISBPs/{chrom}/sorted.bed',
 	output: config['results']+'/2.mappedISBPs/{chrom}/OnSameChrom.bed'
@@ -25,7 +25,7 @@ rule keepMappedOnSameChrom:

 rule upstreamClosest:
 	message: " Collect closest marker upstream of genes"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: annot=config['results']+"/1.features/{chrom}.bed", 
 		markers=config['results']+'/2.mappedISBPs/{chrom}/OnSameChrom.bed'
 	output: config['results']+'/2.closestbed/{chrom}/upstream.txt'
@@ -37,7 +37,7 @@ rule upstreamClosest:

 rule downstreamClosest:
 	message: " Collect Downstream marker downstream of genes"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: annot=config['results']+"/1.features/{chrom}.bed", 
 			markers=config['results']+'/2.mappedISBPs/{chrom}/OnSameChrom.bed'
 	output: config['results']+'/2.closestbed/{chrom}/downstream.txt'

--- a/rules/checkCds.smk
+++ b/rules/checkCds.smk
 rule validateCdsHC:
 	message: " check CDS integrity for HC genes"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: config['finalPrefix']+'_HC.cds.fasta'
 	output: fasta=config['finalPrefix']+'_HC.cds.valid.fasta',
 		csv=config['finalPrefix']+'_HC.cds.valid.explained.txt'
@@ -11,7 +11,7 @@ rule validateCdsHC:

 rule validateCdsLC:
 	message: " check CDS integrity for LC genes"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: config['finalPrefix']+'_LC.cds.fasta'
 	output: fasta=config['finalPrefix']+'_LC.cds.valid.fasta',
 		csv=config['finalPrefix']+'_LC.cds.valid.explained.txt'

--- a/rules/geneAnchoring.smk
+++ b/rules/geneAnchoring.smk
 rule mapHomologousRegions:
 	message: " mapping homologous regions of both references using ISBPs markers as anchors for chromosome {wildcards.chrom}"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input:
 		#closestMarkers=config['results']+'/2.mergedClosestMarkers.txt',
 		marker5prime=config['results']+'/2.closestbed/{chrom}/upstream.txt',
@@ -23,7 +23,7 @@ rule mapHomologousRegions:

 rule recalcBlatMapped:
 	message: " Recalc the coordinates of genes mapped with the Blat pipeline for chromosome {wildcards.chrom}"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input:
 		allBlat=config['results']+'/3.mapping/{chrom}/allBlat.csv',
 		summary=config['results']+'/3.mapping/{chrom}/mappingSummary.csv',
@@ -43,7 +43,7 @@ rule recalcBlatMapped:

 rule gtCleanBlatGff:
 	message: " Clean the gff file recalculated based on Blat fine mapping for chromosome {wildcards.chrom}"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: gff=config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget.gff3',
 		mapping=config['results']+'/3.mapping/{chrom}/temp'
 	output: config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3'
@@ -55,7 +55,7 @@ rule gtCleanBlatGff:

 rule gmapRescue:
 	message: " Rescue anchoring of failed genes with gmap for chrom {wildcards.chrom}"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: blat=config['results']+'/3.mapping/{chrom}/allBlat.csv',
 		wholeGenomeFasta=config['targetFasta'],
 		mapping=config['results']+'/3.mapping/{chrom}/temp'
@@ -77,7 +77,7 @@ rule gmapRescue:

 rule recalcGmapRescue:
 	message: "Recalc the coordinates of the GMAP on target GFF3 files for chromosome {wildcards.chrom}"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: gff=config['results']+'/4.gmapRescue/{chrom}.target.gff3',
 		mapping=config['results']+'/3.mapping/{chrom}/temp'
 	output: config['results']+'/4.recalcGmap/{chrom}/recalc.gff3'
@@ -91,7 +91,7 @@ rule recalcGmapRescue:

 rule saveGmapWG:
 	message: "Save the GMAP results of genes mapped on Whole Genome"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: gff=config['results']+'/4.gmapRescue/{chrom}.wholeGenome.gff3',
 		map=config['chromMapID'],
 		mapping=config['results']+'/3.mapping/{chrom}/temp'
@@ -107,7 +107,7 @@ rule saveGmapWG:

 rule mergeFinalGff3:
 	message: " Merge Final GFF3 files: blat.gff3, rescue.gff3 and wholeGenome.gff3"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: blat=config['results']+'/4.recalcBlat/{chrom}/RecalcAnnotOnTarget-clean.gff3',
 		rescue=config['results']+'/4.recalcGmap/{chrom}/recalc.gff3',
 		wg=config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome.gff3',
@@ -120,7 +120,7 @@ rule mergeFinalGff3:
 		"""
 rule checkMissing:
 	message: " Check missing transfered genes"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: gff=config['results']+'/5.FINAL/{chrom}/annotation.gff3',
 		ref=config['results']+"/1.features/{chrom}.bed"
 	output: config['results']+'/5.FINAL/{chrom}/missing.txt'
@@ -132,7 +132,7 @@ rule checkMissing:

 rule concatAllChromResults:
 	message: " concat all per chromosome results"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: annot=expand(config['results']+'/5.FINAL/{chrom}/annotation.gff3',chrom=config['chromosomes']),
 		differentChrom=expand(config['results']+'/4.gmapWholeGenome/{chrom}.wholeGenome_differentChrom.txt',chrom=config['chromosomes']),
 		missing=expand(config['results']+'/5.FINAL/{chrom}/missing.txt', chrom=config['chromosomes']),
@@ -150,7 +150,7 @@ rule concatAllChromResults:

 rule renameGeneIds:
 	message: " set final gene IDs for the new annotation"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: gff=config['finalPrefix']+'tmp.gff3',
 		map=config['chromMapID']
 	output: gffhc=config['finalPrefix']+'_HC.gff3',
@@ -166,7 +166,7 @@ rule renameGeneIds:

 rule generateFastaSequencesHC:
 	message: "generate fasta sequences using gffread for HC genes"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: gff=config['finalPrefix']+'_HC.gff3',
 		fastaref=config['targetFasta']
 	output: mrna=config['finalPrefix']+'_HC.transcripts.fasta',
@@ -179,7 +179,7 @@ rule generateFastaSequencesHC:
 		"""
 rule generateFastaSequencesLC:
 	message: "generate fasta sequences using gffread for LC genes"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input:
 		gff=config['finalPrefix']+'_LC.gff3',
 		fastaref=config['targetFasta']
@@ -194,7 +194,7 @@ rule generateFastaSequencesLC:

 rule concatblatSummary:
 	message: " Concat all Blat summary"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input:blat=expand(config['results']+'/3.mapping/{chrom}/allBlat.csv', chrom=config['chromosomes']),
 		mapping=expand(config['results']+'/3.mapping/{chrom}/temp', chrom=config['chromosomes'])
 	output: config['finalPrefix']+'_blatSummary.csv'
@@ -205,7 +205,7 @@ rule concatblatSummary:

 rule concatAnchoringSummary:
 	message: " Concat all Anchoring summary"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input:anchoring=expand(config['results']+'/3.mapping/{chrom}/mappingSummary.csv', chrom=config['chromosomes']),
 		mapping=expand(config['results']+'/3.mapping/{chrom}/temp', chrom=config['chromosomes'])
 	output: config['finalPrefix']+'_anchoringSummary.csv'

--- a/rules/preprocessGenomes.smk
+++ b/rules/preprocessGenomes.smk
 rule grepGffFeature:
    message: " Collect selected features from GFF file"
-    conda: "envs/environment.yml"
+    conda: "magatt"
    input: config['annotationQuery']
    params: config['featureType']
    output: temp(config['results']+"/1.features.bed")
@@ -9,7 +9,7 @@ rule grepGffFeature:

 rule splitGffPerChrom:
    message: "Split Gff Features per chromosome: current is {wildcards.chrom}"
-    conda: "envs/environment.yml"
+    conda: "magatt"
    input: config['results']+"/1.features.bed"
    output: temp(config['results']+"/1.features/{chrom}.bed")
    log: config['results']+"/1.features/{chrom}.fgrep.log"
@@ -18,21 +18,21 @@ rule splitGffPerChrom:

 rule indexQuery:
 	message: " Indexing Query fasta file using samtools faidx"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: config['queryFasta']
 	output: config['queryFasta']+'.fai'
 	shell: "samtools faidx {input}"

 rule indexTarget:
 	message: " Indexing Target fasta file using samtools faidx"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: config['targetFasta']
 	output: config['targetFasta']+'.fai'
 	shell: "samtools faidx {input}"

 #rule gmapIndexTarget:
 #	message: " Create Gmap Index for rescue"
-#	conda: "envs/environment.yml"
+#	conda: "magatt"
 #	input: config['targetFasta']
 #	output: directory(config['results']+"/target_gmapindex")
 #	params: indexname="target_gmapindex", indexPath=config['results']

--- a/rules/preprocessISBP.smk
+++ b/rules/preprocessISBP.smk
+rule isbpFastaFromBed:
+	message: 'Extract ISBPs sequence from reference genome'
+	conda: 'magatt'
+	input: bed=config['isbpBed'], fasta=config['queryFasta']
+	output: config['results']+'/1.ISBPs.fasta'
+	log: config['results']+'/1.ISBPS_fastafrombed.log'
+	shell: "bedtools getfasta -name -fi {input.fasta} -bed {input.bed} -fo {output}"
+
+rule isbpBwa:
+	message: 'Map ISBPs on target genome'
+	conda: 'magatt'
+	input: fasta=config['results']+'/1.ISBPs.fasta'
+	output: config['results']+'/1.MappedISBPs.bam'
+	params: mapq=config['mapq'],F=config['flag_F'], bwaidx=config['targetBwaIdx']
+	threads: config['bwaThreads']
+	log: config['results']+'/1.MappedISBPs.log'
+	shell: "bwa mem -t {threads} {params.bwaidx} {input.fasta} | samtools view -bS -F {params.F} -q {params.mapq} - | samtools sort -o {output}"
+
 rule filterBam:
 	message: "Filtering BAM file of ISBPs"
-	conda: "envs/environment.yml"
-	input: config['isbpBam']
+	conda: "magatt"
+	input: config['results']+'/1.MappedISBPs.bam'
 	output: config['results']+'/1.filteredISBPs.bam'
 	params: mapq=config['mapq'], mismatches=config['mismatches']
 	log: config['results']+'/1.filterBam.log'
@@ -9,24 +27,24 @@ rule filterBam:

 rule bam2bed:
 	message: "Convert Filtered BAM file into BED"
-	conda: "envs/environment.yml"
+	conda: "magatt"
 	input: config['results']+"/1.filteredISBPs.bam"
 	output: config['results']+"/1.filteredISBPs/{chrom}/sorted.bed"
 	log: config['results']+"/1.filteredISBPs/{chrom}/sorted.log"
-        params: '{chrom}'
+	params: '{chrom}'
 	shell: "bamToBed -i {input} |fgrep -i {params}|cut -d ':' -f 1|sort -k1,1 -k2,2n  1> {output} 2> {log}"


 #rule dumpISBPsID:
 #	message: "Dump ISBPs IDs"
-#	conda: "envs/environment.yml"
+#	conda: "magatt"
 #	input: config['results']+"/1.filteredISBPs.bed"
 #	output: config['results']+"/1.filteredISBPs.ids"
 #	shell: " cut -f 4 {input} > {output}"

 #rule splitISBP:
 #	message: "Split isbps per chromosome"
-#	conda: "envs/environment.yml"
+#	conda: "magatt"
 #	input: config['results']+"/1.filteredISBPs.bed"
 #	output: config['results']+"/1.filteredISBPs/{chrom}/sorted.bed"
 #	log: config['results']+"/1.filteredISBPs/{chrom}/sorted.log"