RVC_WORKDIR = cfg.RVC.renameLocalDir()
RVC_index_input,RVC_graph_file,RVC_index_output = cfg.RVC.getModuleIndexFiles("rvc-pipeline",RVC_WORKDIR)

#########
# Helper Functions
# #######
def getKnownVCFs(dbSNP_only = False):
    if dbSNP_only:
        return cfg.config_dict["rnaVariantCalling"]["dbSNP"] 
    else:
        highQualityVCFs = cfg.config_dict["rnaVariantCalling"]["dbSNP"] + ";"
        for vcf_file in cfg.config_dict["rnaVariantCalling"]["highQualityVCFs"]:
            highQualityVCFs += vcf_file + ";"
        return highQualityVCFs

def getHaploCallerArgs():
    return cfg.config_dict["rnaVariantCalling"]["hcArgs"]

def getRepeatMask(sortedName=False):
    if sortedName:
        ext = cfg.config_dict["rnaVariantCalling"]["repeat_mask"].strip().split('.')[-1]
        return ".".join(cfg.config_dict["rnaVariantCalling"]["repeat_mask"].strip().split('.')[:-1]) + "_sorted." + ext
    else:
        return cfg.config_dict["rnaVariantCalling"]["repeat_mask"]

def getMinAlt():
    return str(cfg.config_dict["rnaVariantCalling"]["minAlt"])

def createSingleVCF():
    if cfg.RVC.get("createSingleVCF"):
        return expand(os.path.join(
            cfg.processedResultsDir,
            "rnaVariantCalling/sample_vcfs", "{sample}",
            "{sample}.vcf.gz"),
            sample = cfg.RVC.batchIDs)
    else:
        return []
        
#################################
#make sure all of the different {dataset} and {sample} are processed. As defined by the sample annotation table DROP_GROUP
rule rnaVariantCalling:
    input:  RVC_index_input,
            RVC_graph_file,
            createSingleVCF(),
            expand(os.path.join(str(cfg.processedDataDir) + "/rnaVariantCalling/{dataset}_done.txt"),
                   dataset = cfg.RVC.groups),
            expand(os.path.join(
                cfg.processedResultsDir,
                "rnaVariantCalling/batch_vcfs", "{dataset}",
                "{dataset}_{annotation}.annotated.vcf.gz"),
                annotation = cfg.get("geneAnnotation"), dataset = cfg.RVC.groups)
    output: RVC_index_output
    threads: 1
    run:
        if cfg.RVC.run:
            ci(str(RVC_WORKDIR), 'rvc-pipeline')


rule rnaVariantCalling_dependency:
    output: RVC_graph_file
    threads: 1
    shell:
        """
        snakemake --rulegraph rnaVariantCalling | \
        sed -ne '/digraph snakemake_dag/,/}}/p' | \
        dot -Tsvg -Grankdir=TB > {output}
        """


#Define the {sample} and {dataset} variable
#create the empty output file of the form: {dataset}_done.txt
rule allVariants:
    input:
        createSingleVCF(),
        batch_vcfs = expand(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}.processed.vcf.gz"),
            dataset = cfg.RVC.groups)
    output:
        os.path.join(str(cfg.processedDataDir) + "/rnaVariantCalling/{dataset}_done.txt")
    threads: 1
    shell:
        """
        touch {output}
        """

#Use bcftools to split the multi-sample VCF file into a VCF file for the corresponding sample. Normalize the variants to remove artifacts
rule split_multiVCF:
    input:
        lambda wildcards: os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            cfg.RVC.batchIDs[wildcards.sample],
            cfg.RVC.batchIDs[wildcards.sample] + ".processed.vcf.gz")
    threads: 1
    output:
        vcf = os.path.join(
            cfg.processedResultsDir,
            "rnaVariantCalling/sample_vcfs",
            "{sample}",
            "{sample}.vcf.gz"),
        vcf_tabix = os.path.join(
            cfg.processedResultsDir,
            "rnaVariantCalling/sample_vcfs",
            "{sample}",
            "{sample}.vcf.gz.tbi")
    params:
        sample = "{sample}",
        ref = lambda wildcards: cfg.RVC.getGenomePath(wildcards.sample),
    log:
        str(cfg.processedDataDir) + "/rnaVariantCalling/logs/sample_vcfs/{sample}.single_vcf.log"
    shell:
        """
        echo "reading multi-sample vcf into single sample vcf"

        bcftools view -c1 -Ov -s {params.sample} {input} | \
        grep -w -v "*"|grep -w -v "0/0" |bgzip -c > {output.vcf}
        tabix -f -p vcf {output.vcf}

        """

#sort the Repeat Mask index
rule sortIndexRepeatMask:
    input:
        repeat_mask = getRepeatMask()
    output:
        sorted_repeat_mask = getRepeatMask(sortedName = True),
        sorted_index = getRepeatMask(sortedName = True) + ".idx"
    shell:
        """
        sort -k1,2 -V {input.repeat_mask} > {output.sorted_repeat_mask}
        gatk --java-options "-Djava.io.tmpdir={resources.tmpdir}" IndexFeatureFile \
        -I {output.sorted_repeat_mask}
        """

rule simpleAnnotateVCF:
    input:
        vcf = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}.processed.vcf.gz"),
        gtf = lambda wildcards: cfg.genome.getGeneAnnotationFile(wildcards.annotation),
        script = str(RVC_WORKDIR) + "/GATK_BASH/simpleAnnotation.sh"
    output:
        vcf = os.path.join(
            cfg.processedResultsDir,
            "rnaVariantCalling/batch_vcfs",
            "{dataset}",
            "{dataset}_{annotation}.annotated.vcf.gz"),
        vcf_tabix = os.path.join(
            cfg.processedResultsDir,
            "rnaVariantCalling/batch_vcfs",
            "{dataset}",
            "{dataset}_{annotation}.annotated.vcf.gz.tbi"),
    shell:
        """
        {input.script} {input.vcf} {input.gtf} {output.vcf}
        """

rule maskMultiVCF:
    input:
        vcf = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.genotyped.filtered_clean.vcf.gz"),
        vcf_tbi = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.genotyped.filtered_clean.vcf.gz.tbi"),
        repeat_mask = getRepeatMask(sortedName = True),
        batch_params = os.path.join(cfg.processedDataDir, "rnaVariantCalling/params/batches" ,
                                    "{dataset}_batchParams.csv"),
        script = str(RVC_WORKDIR) + "/GATK_BASH/maskSingleVCF.sh"
    output:
        vcf = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}.processed.vcf.gz")),
        vcf_tbi = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}.processed.vcf.gz.tbi"))
    params:
        ref = lambda wildcards: cfg.RVC.getGenomePath(wildcards.dataset),
    log:
        str(cfg.processedDataDir) + "/rnaVariantCalling/logs/all_haplocaller/{dataset}_mask.log"
    shell:
        """
        {input.script} {input.vcf} {input.repeat_mask} {params.ref} {resources.tmpdir} {log} {output.vcf}
        """

# Use bcftools to split and left normalize variants. 
# Variants labeled AAAG>AC can be shortened to AAG>C
# split VCF with multiple variants on a single line into a VCF with a single variant per line. G>A,C into 2 lines G>A and G>C
rule leftNormalVCF:
    input:
        vcf = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.genotyped.filtered.vcf.gz"),
        tbi = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.genotyped.filtered.vcf.gz.tbi")
    output:
        vcf = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.genotyped.filtered_clean.vcf.gz")),
        tbi = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.genotyped.filtered_clean.vcf.gz.tbi"))
    params:
        ref = lambda wildcards: cfg.RVC.getGenomePath(wildcards.dataset)
    log:
        str(cfg.processedDataDir) + "/rnaVariantCalling/logs/all_haplocaller/{dataset}_split_norm.log"
    shell:
        """
        bcftools norm -m-both {input.vcf} | grep -v -w "*" |  \
        bcftools norm -f {params.ref} | bgzip -c > {output.vcf}
        tabix -f -p vcf {output.vcf}
        """

#Use GATK to filter variants based on the quality scores and variant frequency based on the GATK-best practices
rule filterVCF:
    input:
        vcf = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.genotyped.raw.vcf.gz"),
        tbi = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.genotyped.raw.vcf.gz.tbi"),
    output:
        temp_vcf = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "TEMP.{dataset}_all_samples.genotyped.filtered.vcf.gz")),
        temp_vcf_tbi = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "TEMP.{dataset}_all_samples.genotyped.filtered.vcf.gz.tbi")),
        filt_vcf = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.genotyped.filtered.vcf.gz"
            )),
        filt_vcf_tbi = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.genotyped.filtered.vcf.gz.tbi"))
    params:
        ref = lambda wildcards: cfg.RVC.getGenomePath(wildcards.dataset),
        minAlt = getMinAlt()
    log:
        str(cfg.processedDataDir) + "/rnaVariantCalling/logs/all_haplocaller/{dataset}_filterVariants.log"
    shell:
        """
        # filter any sample for the 2nd AD field (ALT), if any pass filter mark PASS else mark minALT
        bcftools filter -i 'FORMAT/AD[:1] >= {params.minAlt}' {input.vcf}  -s "minALT" -o {output.temp_vcf} -O b
        tabix -f -p vcf  {output.temp_vcf}

        gatk --java-options "-Djava.io.tmpdir={resources.tmpdir}" VariantFiltration \
        -R {params.ref} -V {output.temp_vcf} \
        -window 35 -cluster 3 --filter-name FS --filter-expression "FS > 30.0" \
        --filter-name QD --filter-expression "QD < 2.0" -O {output.filt_vcf} 2> {log}

        """


#Using GATK GenotypeGVCFs to make the variant calls from the combined g.vcf files
rule genotypeGVCFs:
    input:
        gvcf = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.g.vcf.gz"),

        gvcf_tbi = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.g.vcf.gz.tbi"),

        batch_params = os.path.join(cfg.processedDataDir, "rnaVariantCalling/params/batches" ,
                                    "{dataset}_batchParams.csv")
    output:
        vcf = temp(os.path.join( cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.genotyped.raw.vcf.gz")),
        tbi = temp(os.path.join( cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.genotyped.raw.vcf.gz.tbi"))
    params:
        ref = lambda wildcards: cfg.RVC.getGenomePath(wildcards.dataset),
    log:
        str(cfg.processedDataDir) + "/rnaVariantCalling/logs/all_haplocaller/{dataset}_genotypeGVCFs.log"
    shell:
        """
        gatk --java-options "-Djava.io.tmpdir={resources.tmpdir}" GenotypeGVCFs \
        -R {params.ref} --variant {input.gvcf} -O {output.vcf} 2> {log}
        """


#Using GATK combine the vcfs from each sample within a {dataset} into a multi-sample vcf file to improve genotyping and variant calls
rule combineGVCFs:
    input:
        gvcfs = lambda wildcards: expand(
            os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/sample_vcfs",
            "{sample}",
            "{sample}.g.vcf.gz"
            ), sample = sa.getIDsByGroup(wildcards.dataset)),
        batch_params = os.path.join(cfg.processedDataDir, "rnaVariantCalling/params/batches" ,
                                    "{dataset}_batchParams.csv")

    output:
        gvcf = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.g.vcf.gz"
            ),
        gvcf_tbi = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/batch_vcfs",
            "{dataset}",
            "{dataset}_all_samples.g.vcf.gz.tbi")
    params:
        ref = lambda wildcards: cfg.RVC.getGenomePath(wildcards.dataset),
        variant_list = lambda wildcards: [f"--variant {gvcf}" for gvcf in expand(
            os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/sample_vcfs",
            "{sample}",
            "{sample}.g.vcf.gz"
            ), sample = sa.getIDsByGroup(wildcards.dataset))]
    log:
        str(cfg.processedDataDir) + "/rnaVariantCalling/logs/all_haplocaller/{dataset}_combineGVCF.log"
    shell:
        """
        gatk --java-options "-Djava.io.tmpdir={resources.tmpdir}" CombineGVCFs \
        -R {params.ref} {params.variant_list} -O {output.gvcf} 2> {log}
        """


#Using GATK HaplotypeCaller take the cleaned and recalibrated BAM file as input for the variant calling.
rule haplotypeCaller:
    input:
        bam = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.split.bqsr.out.bam"),
        bai = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.split.bqsr.out.bai"),
        script = str(RVC_WORKDIR) + "/GATK_BASH/haplotypeCaller.sh"
    output:
        os.path.join(
        cfg.processedDataDir,
        "rnaVariantCalling/out/sample_vcfs",
        "{sample}",
        "{sample}.g.vcf.gz")
    params:
        ref = lambda wildcards: cfg.RVC.getGenomePath(wildcards.sample),
        ucsc2ncbi = cfg.workDir / "Scripts/Pipeline/chr_UCSC_NCBI.txt",
        ncbi2ucsc = cfg.workDir / "Scripts/Pipeline/chr_NCBI_UCSC.txt",
        hcArgs = getHaploCallerArgs(),
        dbSNP = getKnownVCFs(dbSNP_only = True)
    log:
        str(cfg.processedDataDir) + "/rnaVariantCalling/logs/all_haplocaller/{sample}.log"
    shell:
        """
        {input.script} {input.bam} {input.bai} \
        {params.ref} {params.dbSNP} {params.ucsc2ncbi} {params.ncbi2ucsc} \
        {log} {resources.tmpdir} {output} {params.hcArgs}
        """


#Using GATK ApplyBQSR takes the frequency table and confidence scores generated by BQSR and recalculates the BAM quality scores
rule applyBQSR:
    input:
        bam = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.split.out.bam"),
        bai = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.split.out.bai"),
        table = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bqsr/{sample}_recal.table")
    output:
        bam = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.split.bqsr.out.bam")),
        bai = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.split.bqsr.out.bai"))
    params:
        ref = lambda wildcards: cfg.RVC.getGenomePath(wildcards.sample),
    log:
        str(cfg.processedDataDir) + "/rnaVariantCalling/logs/applyBQSR/{sample}.log"
    shell:
        """
        gatk --java-options "-Djava.io.tmpdir={resources.tmpdir}" ApplyBQSR  \
        -R {params.ref} -I {input.bam} --bqsr-recal-file {input.table} \
        --add-output-sam-program-record --use-original-qualities -O {output.bam} 2> {log}
        """


#Using GATK BaseRecalibrator (BQSR) use the known sites (dbSNP + others) to improve read scoring
rule bqsr:
    input:
        bam = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.split.out.bam"),
        bai = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.split.out.bai"),
        script = str(RVC_WORKDIR) + "/GATK_BASH/bqsr.sh"
    output:
        bqsr_table = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bqsr",
            "{sample}_recal.table")
    params:
        ref = lambda wildcards: cfg.RVC.getGenomePath(wildcards.sample),
        highQualityVCFs = getKnownVCFs(),
        ucsc2ncbi = cfg.workDir / "Scripts/Pipeline/chr_UCSC_NCBI.txt",
        ncbi2ucsc = cfg.workDir / "Scripts/Pipeline/chr_NCBI_UCSC.txt"
    log:
        str(cfg.processedDataDir) + "/rnaVariantCalling/logs/bqsr/{sample}.log"
    shell:
        """
        {input.script} {input.bam} {input.bai} \
        {params.ref} "{params.highQualityVCFs}" {params.ucsc2ncbi} {params.ncbi2ucsc} \
        {log} {resources.tmpdir} {output.bqsr_table}
        """


#Using GATK splitNCigarReads make use of the RNA splicing characteristic by mapping reads with large gaps to the reference. Split the RNAseq reads into subsections that will have better local alignments
rule splitNcigar:
    input:
        bam = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.FAorder.out.bam"),
        bai = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.FAorder.out.bai"),
        fai = lambda wildcards: str(cfg.RVC.getGenomePath(wildcards.sample)) + ".fai",
        dict = lambda wildcards: cfg.genome.getFastaDict(cfg.RVC.getGenomePath(wildcards.sample))
    output:
        bam = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.split.out.bam")),
        bai = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.split.out.bai"))
    params:
        ref = lambda wildcards: cfg.RVC.getGenomePath(wildcards.sample)
    log:
        str(cfg.processedDataDir) + "/rnaVariantCalling/logs/splitNcigar/{sample}.log"
    shell:
        """
        gatk --java-options "-Djava.io.tmpdir={resources.tmpdir}" SplitNCigarReads \
        -R {params.ref} -I {input.bam} -fixNDN \
        -O {output.bam} 2> {log}
        #-RMQT 60 -U ALLOW_N_CIGAR_READS --allow_potentially_misencoded_quality_scores 2> {log}
        """

# Using picard ReorderSam the bam files so that they match the reference genome order.
rule reorderBAM:
    input:
        bam = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.out.bam"),
        dict = lambda wildcards: cfg.genome.getFastaDict(cfg.RVC.getGenomePath(wildcards.sample))
    output:
        bam = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.FAorder.out.bam"
            )),
        bai = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.FAorder.out.bai"))
    shell:
        """
        echo "ReorderSam"
        gatk --java-options "-Djava.io.tmpdir={resources.tmpdir}" ReorderSam \
        -I {input.bam} -O {output.bam} --SEQUENCE_DICTIONARY {input.dict} -S true --CREATE_INDEX true 
        """

#Using GATK markDuplicates attempt to identify reads that are technical duplicates of biological reads. Attempts to eliminate noise introduced by library prep
rule markDuplicates:
    input:
        bam = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.out.bam"),
        bai = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.out.bam.bai")
    output:
        bam = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.out.bam")),
        bai = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.dupMarked.out.bai"))
    params:
        metrics = os.path.join( cfg.processedDataDir, "rnaVariantCalling/out/picard-tools-marked-dup-metrics.txt"),
    log:
        str(cfg.processedDataDir) + "/rnaVariantCalling/logs/markDuplicates/{sample}.log"
    shell:
        """
        gatk MarkDuplicates \
        -I {input.bam} -O {output.bam} \
        -M {params.metrics} --CREATE_INDEX true \
        --TMP_DIR "{resources.tmpdir}" \
        --VALIDATION_STRINGENCY SILENT 2> {log}
        """


#Using samtools sort the reads based on their chromosomal coordinates
rule sortBam:
    input:
        bam = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_dropHeader.bam"),
        bai = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_dropHeader.bam.bai")
    output:
        bam = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.out.bam")),
        bai = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_Aligned.sortedByCoord.out.bam.bai"))
    log:
        str(cfg.processedDataDir) + "/rnaVariantCalling/logs/sortBam/{sample}.log"

    shell:
        """
        samtools sort {input.bam} -O BAM -o {output.bam} &> {log}
        samtools index -b {output.bam}
        """


rule changeHeader:
    input:
        bam = lambda wildcards: sa.getFilePath(wildcards.sample, "RNA_BAM_FILE"),
        bai = lambda wildcards: sa.getFilePath(wildcards.sample, "RNA_BAM_FILE") + ".bai",
        script = str(RVC_WORKDIR) + "/GATK_BASH/changeHeader.sh"
    threads: 1
    output:
        bam = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_dropHeader.bam")),
        bai = temp(os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_dropHeader.bam.bai")),
        newHeader = os.path.join(
            cfg.processedDataDir,
            "rnaVariantCalling/out/bam",
            "{sample}",
            "{sample}_newDropHeader.txt")
    params:
        sample="{sample}"
    log:
        str(cfg.processedDataDir) + "/rnaVariantCalling/logs/changeHeader/{sample}.log"
    shell:
        """
        {input.script} {input.bam} {input.bai} {params.sample} {log} \
        {output.bam} {output.bai} {output.newHeader}
        """
