#!/usr/bin/make -rRf

# LongStitch: genome assembly correction and scaffolding pipeline using long reads
# Version v1.0.5

# Input files
draft=draft
reads=reads
draft_path=draft_path
draft_path_base=$(shell basename $(draft_path))
reads_path=reads_path
reads_path_base=$(shell basename $(reads_path))


# Find the complete long read file name
fastq_gz=$(shell test -f $(reads).fq.gz && echo "true")
fastq=$(shell test -f $(reads).fq && echo "true")
fastq_long=$(shell test -f $(reads).fastq && echo "true")
fastq_gz_long=$(shell test -f $(reads).fastq.gz && echo "true")

fasta_gz=$(shell test -f $(reads).fa.gz && echo "true")
fasta=$(shell test -f $(reads).fa && echo "true")
fasta_long=$(shell test -f $(reads).fasta && echo "true")
fasta_gz_long=$(shell test -f $(reads).fasta.gz && echo "true")

ifeq ($(fastq_gz), true)
long_reads=$(reads).fq.gz
endif
ifeq ($(fastq), true)
long_reads=$(reads).fq
endif
ifeq ($(fastq_long), true)
long_reads=$(reads).fastq
endif
ifeq ($(fastq_gz_long), true)
long_reads=$(reads).fastq.gz
endif

ifeq ($(fasta_gz), true)
long_reads=$(reads).fa.gz
endif
ifeq ($(fasta), true)
long_reads=$(reads).fa
endif
ifeq ($(fasta_long), true)
long_reads=$(reads).fasta
endif
ifeq ($(fasta_gz_long), true)
long_reads=$(reads).fasta.gz
endif


# Common parameters
z=1000
t=8

# Default Tigmint parameters
span=auto
dist=auto
G=0
longmap=ont

# Default ntLink parameters
k_ntLink=32
w=100
conservative=True
a_ntLink=1
gap_fill=False
rounds=1

# Default ARCS/ARKS+LINKS parameters
s=70
j=0.05
k_arks=20
c=4
l=4
a=0.3

# Fixed parameters; set for naming purposes
cut=250
n=2
m=8-10000
r=0.05
e=30000
D=true

# Reference for QUAST run
ref=None
quast_t=48

# Output prefix for final scaffolds file
out_prefix=None

# Use pigz or bgzip for parallel compression if available.
ifneq ($(shell command -v pigz),)
gzip=pigz -p$t
else
ifneq ($(shell command -v bgzip),)
gzip=bgzip -@$t
else
gzip=gzip
endif
endif

# Record run time and memory usage in a file using GNU time
track_time=0
ifeq ($(track_time), 0)
longstitch_time=
else
ifneq ($(shell command -v gtime),)
longstitch_time=command gtime -v -o $@.time
else
longstitch_time=command time -v -o $@.time
endif
ifneq ($(shell command -v memusg),)
longstitch_time=command memusg -t -o $@.time
endif
endif

# Determine path to LongStitch executables
bin=$(shell dirname `command -v $(MAKEFILE_LIST)`)

.PHONY: help run version clean tigmint ntLink tigmint-ntLink ntLink-with-tigmint tigmint-arcs tigmint-arks \
		arcs-with-tigmint arks-with-tigmint ntLink-arcs ntLink-arks arcs-with-ntLink arks-with-ntLink \
		tigmint-ntLink-arcs tigmint-ntLink-arks arcs-with-tigmint-ntLink arks-with-tigmint-ntLink \
		check_ref longstitch_quast make_links
.DELETE_ON_ERROR:
.SECONDARY:

# Help
help:
	@echo "LongStitch v1.0.5"
	@echo ""
	@echo "Usage: ./longstitch [COMMAND] [OPTION=VALUE]…"
	@echo ""
	@echo "For example, to run the default pipeline on a draft assembly draft-assembly.fa with the reads reads.fa.gz and a genome size of gsize:"
	@echo "longstitch run draft=draft-assembly reads=reads G=gsize"
	@echo ""
	@echo "	Commands:"
	@echo ""
	@echo "	run     		run default LongStitch pipeline: Tigmint, then ntLink"
	@echo ""
	@echo "	tigmint-ntLink-arks	run full LongStitch pipeline: Tigmint, ntLink, then ARCS in kmer mode"
	@echo "	tigmint-ntLink		run Tigmint, then ntLink (Same as 'run' target)"
	@echo "	ntLink-arks		run ntLink, then run ARCS in kmer mode"
	@echo ""
	@echo "	General options (required):"
	@echo "	draft			draft name [draft]. File must have .fa extension"
	@echo "	reads			read name [reads]. The reads file can be uncompressed or gzipped."
	@echo "				Accepted read file extensions: .fq, .fq.gz, .fastq, .fastq.gz, .fa, .fa.gz, .fasta, .fasta.gz"
	@echo ""
	@echo "	General options (optional):"
	@echo "	t			number of threads [8]"
	@echo "	z			minimum size of contig (bp) to scaffold [1000]"
	@echo "	out_prefix		if supplied, final scaffolds will be soft-linked to <out_prefix>.scaffolds.fa"
	@echo ""
	@echo "	Tigmint options:"
	@echo "	span			min number of spanning molecules to be considered correctly assembled [auto]"
	@echo "	dist			maximum distance between alignments to be considered the same molecule [auto]"
	@echo "	G			haploid genome size (bp) for calculating span parameter (e.g. '3e9' for human genome). Required when span=auto [0]"
	@echo "	longmap			long read technology - used for minimap2 preset. 'ont' for nanopore, 'pb' for pacbio, 'hifi' for pacbio HiFi reads [ont]"
	@echo ""
	@echo "	ntLink options:"
	@echo "	k_ntLink		k-mer size for minimizers [32]"
	@echo "	w			window size for minimizers [100]"
	@echo "	gap_fill	        use gap-filling feature [False]"
	@echo "	rounds			number of ntLink rounds [1]"
	@echo ""
	@echo "	ARCS+LINKS options:"
	@echo "	j			minimum fraction of read kmers matching a contigId (used in kmer mode) [0.05]"
	@echo "	k_arks			size of a k-mer (used in kmer mode) [20]"
	@echo "	c			minimum aligned read pairs per molecule [4]"	
	@echo "	l			minimum number of links to compute scaffold [4]"
	@echo "	a			maximum link ratio between two best contain pairs [0.3]"
	@echo ""
	@echo "Notes:"
	@echo "	- by default, span is automatically calculated as 1/4 of the sequence coverage of the input long reads"
	@echo "	- G (genome size) must be specified if span=auto"
	@echo "	- by default, dist is automatically calculated as p5 of the input long read lengths"
	@echo "	- Ensure that all input files are in the current working directory, making soft-links if needed"

clean:
	rm -f *.amb *.ann *.bwt *.pac *.sa *.dist.gv *.fai *.bed *.molecule.tsv *.sortbx.bai *.k$(k).w$(w).tsv *.k$(k).w$(w).tsv
	@echo "Clean Done"

version:
	@echo "LongStitch v1.0.5"
	@echo "Written by Lauren Coombe, Janet Li and Theodora Lo"

# Make soft links for files
make_links: $(draft_path_base) $(reads_path_base)

$(draft_path_base): $(draft_path)
ifeq ($(draft_path),draft_path)
	$(error For make_links target, must set draft_path)
endif
	ln -s $(draft_path)

$(reads_path_base): $(reads_path)
ifeq ($(reads_path),reads_path)
	$(error For make_links target, must set reads_path)
endif
	ln -s $(reads_path)

# Run tigmint-long
run: tigmint-ntLink
tigmint-ntLink-arcs: tigmint-ntLink arcs-with-tigmint-ntLink
tigmint-ntLink-arks: tigmint-ntLink arks-with-tigmint-ntLink
tigmint-ntLink: tigmint ntLink-with-tigmint
tigmint-arcs: tigmint arcs-with-tigmint
tigmint-arks: tigmint arks-with-tigmint
tigmint: $(draft).cut$(cut).tigmint.fa
	
$(draft).cut$(cut).tigmint.fa: $(draft).fa $(long_reads)
	$(longstitch_time) tigmint-make tigmint-long draft=$(draft) reads=$(reads) cut=$(cut) t=$t G=$G span=$(span) dist=$(dist) longmap=$(longmap)

# Run ntLink
ntLink-with-tigmint: $(draft).cut$(cut).tigmint.fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds.fa \
	$(draft).k$(k_ntLink).w$(w).tigmint-ntLink.longstitch-scaffolds.fa
ntLink-arcs: ntLink arcs-with-ntLink
ntLink-arks: ntLink arks-with-ntLink
ntLink: $(draft).fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds.fa

ntLink_targets=
ifeq ($(rounds), 1)
ntLink_targets+=scaffold
ifeq ($(gap_fill), True)
ntLink_targets+=gap_fill
endif
else
ifeq ($(gap_fill), True)
ntLink_targets+=run_rounds_gaps
else
ntLink_targets+=run_rounds
endif
endif

%.tigmint.fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds.fa: %.tigmint.fa $(long_reads)
ifneq ($(rounds), 1)
	$(longstitch_time) ntLink_rounds $(ntLink_targets) rounds=$(rounds) target=$< reads="$(long_reads)" t=$t k=$(k_ntLink) w=$w z=$z n=$n a=$(a_ntLink) conservative=$(conservative)
	ln -sf $*.tigmint.fa.k$(k_ntLink).w$w.z$z.ntLink.$(rounds)rounds.fa $@
else
	$(longstitch_time) ntLink $(ntLink_targets) target=$< reads="$(long_reads)" t=$t k=$(k_ntLink) w=$w z=$z n=$n a=$(a_ntLink) conservative=$(conservative)
endif

$(draft).fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds.fa: $(draft).fa $(long_reads)
ifneq ($(rounds), 1)
	$(longstitch_time) ntLink_rounds $(ntLink_targets) rounds=$(rounds) target=$< reads="$(long_reads)" t=$t k=$(k_ntLink) w=$w z=$z n=$n a=$(a_ntLink) conservative=$(conservative)
	ln -sf $(draft).fa.k$(k_ntLink).w$w.z$z.ntLink.$(rounds)rounds.fa $@
else
	$(longstitch_time) ntLink $(ntLink_targets) target=$< reads="$(long_reads)" t=$t k=$(k_ntLink) w=$w z=$z n=$n a=$(a_ntLink) conservative=$(conservative)
endif

$(draft).k$(k_ntLink).w$(w).tigmint-ntLink.longstitch-scaffolds.fa: $(draft).cut$(cut).tigmint.fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds.fa 
	ln -sf $< $@
	echo "Done LongStitch steps Tigmint-long and ntLink! Scaffolds can be found in: $@"
ifneq ($(out_prefix), None)
	ln -sf $< $(out_prefix).scaffolds.fa
endif

# Run arcs-long
arcs-with-ntLink: $(draft).fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa
arcs-with-tigmint: $(draft).cut$(cut).tigmint_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa
arcs-with-tigmint-ntLink: $(draft).cut$(cut).tigmint.fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa

%.ntLink.scaffolds_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.ntLink.scaffolds.fa $(long_reads)
	$(longstitch_time) arcs-make arcs-long draft=$*.ntLink.scaffolds reads=$(reads) m=$m cut=$(cut) s=$s l=$l c=$c a=$a D=$D z=$z

%.tigmint_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.tigmint.fa $(long_reads)
	$(longstitch_time) arcs-make arcs-long draft=$*.tigmint reads=$(reads) m=$m cut=$(cut) s=$s l=$l c=$c a=$a D=$D z=$z

%.ntLink.scaffolds_c$c_m$m_cut$(cut)_s$s_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.ntLink.scaffolds.fa $(long_reads)
	$(longstitch_time) arcs-make arcs-long draft=$*.ntLink.scaffolds reads=$(reads) m=$m cut=$(cut) s=$s l=$l c=$c a=$a D=$D z=$z

# Run with arks-long
arks-with-ntLink: $(draft).fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds_c$c_m$m_cut$(cut)_k$(k_arks)_r$r_e$e_z$z_l$l_a$a.scaffolds.fa \
	$(draft).k$(k_ntLink).w$(w).ntLink-arks.longstitch-scaffolds.fa
arks-with-tigmint: $(draft).cut$(cut).tigmint_c$c_m$m_cut$(cut)_k$(k_arks)_r$r_e$e_z$z_l$l_a$a.scaffolds.fa
arks-with-tigmint-ntLink: $(draft).cut$(cut).tigmint.fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds_c$c_m$m_cut$(cut)_k$(k_arks)_r$r_e$e_z$z_l$l_a$a.scaffolds.fa \
			$(draft).k$(k_ntLink).w$(w).tigmint-ntLink-arks.longstitch-scaffolds.fa

$(draft).k$(k_ntLink).w$(w).tigmint-ntLink-arks.longstitch-scaffolds.fa: $(draft).cut$(cut).tigmint.fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds_c$c_m$m_cut$(cut)_k$(k_arks)_r$r_e$e_z$z_l$l_a$a.scaffolds.fa
	ln -sf $< $@
	echo "Done LongStitch steps Tigmint-long, ntLink and ARKS-long! Scaffolds can be found in: $@"
ifneq ($(out_prefix), None)
	ln -sf $< $(out_prefix).scaffolds.fa
endif

$(draft).k$(k_ntLink).w$(w).ntLink-arks.longstitch-scaffolds.fa: $(draft).fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds_c$c_m$m_cut$(cut)_k$(k_arks)_r$r_e$e_z$z_l$l_a$a.scaffolds.fa
	ln -sf $< $@
	echo "Done LongStitch steps ntLink and ARKS-long! Scaffolds can be found in: $@"
ifneq ($(out_prefix), None)
	ln -sf $< $(out_prefix).scaffolds.fa
endif

%.ntLink.scaffolds_c$c_m$m_cut$(cut)_k$(k_arks)_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.ntLink.scaffolds.fa $(long_reads)
	$(longstitch_time) arcs-make arks-long draft=$*.ntLink.scaffolds reads=$(reads) m=$m cut=$(cut) j=$j k=$(k_arks) l=$l c=$c a=$a D=$D z=$z

%.tigmint_c$c_m$m_cut$(cut)_k$(k_arks)_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.tigmint.fa $(long_reads)
	$(longstitch_time) arcs-make arks-long draft=$(draft).cut$(cut).tigmint reads=$(reads) m=$m cut=$(cut) j=$j k=$(k_arks) l=$l c=$c a=$a D=$D z=$z

%.ntLink.scaffolds_c$c_m$m_cut$(cut)_k$(k_arks)_r$r_e$e_z$z_l$l_a$a.scaffolds.fa: %.ntLink.scaffolds.fa $(long_reads)
	$(longstitch_time) arcs-make arks-long draft=$*.ntLink.scaffolds reads=$(reads) m=$m cut=$(cut) j=$j k=$(k_arks) l=$l c=$c a=$a D=$D z=$z

# Pre-processing long reads; cut into shorter segments and optionally calculate tigmint-long dist and span parameters
$(reads).cut$(cut).fq.gz: $(long_reads)
ifeq ($(span), auto)
ifneq ($G, 0)
ifeq ($(dist), auto)
	$(longstitch_time) sh -c '$(bin)/src/long-to-linked-pe -l$(cut) -g$G -s -d -f $(reads).tigmint-long.params.tsv $< | $(gzip) > $@'
else
	$(longstitch_time) sh -c '$(bin)/src/long-to-linked-pe -l$(cut) -g$G -s -f $(reads).tigmint-long.params.tsv $< | $(gzip) > $@'
endif
else
	$(error ERROR: genome size (G) must be provided to calculate span parameter in long-to-linked-pe)
endif
else
ifeq ($(dist), auto)
	$(longstitch_time) sh -c '$(bin)/src/long-to-linked-pe -l$(cut) -g$G -d -f $(reads).tigmint-long.params.tsv $< | $(gzip) > $@'
else
	$(longstitch_time) sh -c '$(bin)/src/long-to-linked-pe -l$(cut) -g$G --f $(reads).tigmint-long.params.tsv $<| $(gzip) > $@'
endif
endif

# QUAST targets

longstitch_quast: arks-with-tigmint-ntLink check_ref \
			quast_longstitch-z$z_ntLink-k$(k_ntLink)-w$w_arks-long

ntLink_quast: ntLink check_ref \
			quast_ntLink-z$z_ntLink-k$(k_ntLink)-w$w

check_ref:
ifeq ($(ref), None)
	$(error ERROR: Must set reference for longstitch_quast target)
endif

quast_longstitch-z$z_ntLink-k$(k_ntLink)-w$w_arks-long: arks-with-tigmint-ntLink
	quast -t $(quast_t) -o $@ -r $(ref) --fast --scaffold-gap-max-size 100000  --large \
		$(draft).fa  \
		$(draft).cut$(cut).tigmint.fa \
		$(draft).cut$(cut).tigmint.fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds.fa \
		$(draft).cut$(cut).tigmint.fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds_c$c_m$m_cut$(cut)_k$(k_arks)_r$r_e$e_z$z_l$l_a$a.scaffolds.fa

quast_ntLink-z$z_ntLink-k$(k_ntLink)-w$w: ntLink
	quast -t $(quast_t) -o $@ -r $(ref) --fast --scaffold-gap-max-size 100000  --large \
	$(draft).fa  \
	$(draft).fa.k$(k_ntLink).w$w.z$z.ntLink.scaffolds.fa
