#!/bin/sh


##############################################################################################################################
##### READ ME ################################################################################################################
##############################################################################################################################
	
	# FORENSIC supports only fasta file!
	# WARNING: headers have to be structure like this: 
	# >sampleID|sequenceID (=> sampleID first, then "|", then sequence ID)

	# Please find below the script to trim and merge your fasta.
	# It trim the primers using CUTADAPT and merge the pair-end reads using PEAR
	# All samples will be concatenated in one fasta that is formated for FORENSIC

	# If you want to use that script has a semi-automated script, you need to:
	# 	1. Manually change the parameters between the lines 35 and 43
	#	2. Save the file
	#	3. In the terminal, you need to give the autorization of the script TrimMerge.sh 
	#	For that: chmod +x TrimMerge.sh. 
	# 	Run the script = copy and paste the path of TrimMerge.sh in the terminal


############################################################################################################################## 
##### 1. Check if the softwares are properly installed
cutadapt --version				# http://cutadapt.readthedocs.io/en/stable/
pear --version					# https://sco.h-its.org/exelixis/web/software/pear/
gzip -v 

# You shouldn't get an error message when running the commands below. If yes, you need to re-install them. The easiest way to do it is to use anaconda. 
	# 1. to install conda: https://www.anaconda.com/distribution/
	# 2. to install cutadapt (unix+mac), run the command: conda install -c bioconda cutadapt 
	# 3. to install pear (unix+mac), run the command: conda install -c bioconda pear
	

##############################################################################################################################
##### 2. Parameters that can/have to be changed by the user 

workingPATH="~/WS/test_craig/"  	# Path to the parent directory. It is not the directory where are stored the fastqs. Should finish by "/"
fastq_directory="fastq"	# Name of the folder containing the fastqs. It must be located in the parent directory.
output_file="craig_test.fa"			# Name of the output fasta
region="V4"							# 16S rRNA gene region targeted V4 (EMP modified primer set), V4_original (EMP original primer set) or V6
reads_trimmed="no"					# If sequences have been processed according to the Earth Microbiome protocol, the adaptors/primers have not been sequenced, the flag has thus to be set as "yes". 
R1="_R1"							# How can we discriminate R1 in the filename? _R1? _R1_?
R2="_R2"							# How can we discriminate R2 in the filename? _R2? _R2_?

removeCharacter=""					# Any character you want to delete between the Sample ID and the R1/R2 characters. Example: removeCharacter="_L001" for SampleID_L001_R1.fastq.gz
threads=8							# Number of threads to use

##############################################################################################################################
##############################################################################################################################
##############################################################################################################################


##############################################################################################################################
###### AFTER THAT LINE, YOU DON'T HAVE TO MODIFY ANYTHING!!!##################################################################
###


##############################
######### Preparation ######## 
##############################
alias cdproj="cd $workingPATH$fastq_directory"
cdproj


	### Preparation
if [[ $region == "V6" ]]
then
	primerF="MNAMSCGMNRAACCTYANC"
	primerR="CGACRRCCATGCANCACCT"
	minlength=54
	maxlength=66
elif [[ $region == "V4" ]]
then
	primerF="YCAGCMGCCGCGGTAA"
	primerR="GGACTACNVGGGTWTCTAAT"
	minlength=227	
	maxlength=279
elif [[ $region == "V4_original" ]]
then
	primerF="CCAGCMGCCGCGGTAA"
	primerR="GGACTACHVGGGTWTCTAAT"
	minlength=227	
	maxlength=279
fi


if [[ $reads_trimmed="no" ]]
then
	mkdir ../Cutadapt/; printf "Creation of the directory ${workingPATH}Cutadapt/\n"
	mkdir ../PEAR/; printf "Creation of the directory ${workingPATH}PEAR/\n"
	mkdir ../fasta/; printf "Creation of the directory ${workingPATH}fasta/\n"
	mkdir ../PEAR/temp/


##############################
### Trimming with CUTADAPT ### 
##############################
	printf "\n\n### Trimming using Cutadapt ###\n"

	# Trim reads using cutadapt
	LOG_FILE_CUTADAPT="../Cutadapt/log_cutadapt.log"
	for fileR1 in *$R1*; do
	    fileR2=${fileR1/$R1/$R2}; printf "Trimming of ${fileR1} and ${fileR2} " 1>&3
		cutadapt -e 0.2 -m 50 -g "$primerF" -G "$primerR" -o "../Cutadapt/$fileR1" -p "../Cutadapt/$fileR2" $fileR1 $fileR2; printf "done\n" 1>&3
	done 3>&1 1>>${LOG_FILE_CUTADAPT} 2>&1
	printf "\n\n"

       
	# Export trimmed fastq in a temporary folder in order to be processed
	for file in ../Cutadapt/*; do cp ${file} ../PEAR/temp/; printf "Copied ${file}\n"; done
	cd ../PEAR/temp/


elif [[ $reads_trimmed="yes" ]]
then
	mkdir ../PEAR/; printf "Creation of the directory ${workingPATH}PEAR/\n"
	mkdir ../fasta/; printf "Creation of the directory ${workingPATH}fasta/\n"
	mkdir ../PEAR/temp/

	for file in *; do cp ${file} ../PEAR/temp/; printf "Copied ${file}\n"; done
	cd ../PEAR/temp/
else
 	echo "The flag 'reads_trimmed' is not correctly defined "
fi


###########################
#### Merging with PEAR ####
###########################

	# Preparation
#for file in *.gz; do gzip -d ${file}; printf "Unzipped ${file}\n"; done
for i in *.fastq*; do mv "$i" "${i/$removeCharacter/}"; done


qualitytrimming=0								# Quality score threshold for trimming the low quality part of a read. If the quality scores of two consecutive bases are strictly less than the specified threshold, the rest of the read will be trimmed. (default: 0)
basesN=0										# Maximal proportion of uncalled bases in a read. Setting this value to 0 will discard all reads containing uncalled bases (N). The other extreme setting is 1 which causes PEAR to process all reads independent on the number of uncalled bases. (default: 1)
LOG_FILE_PEAR="../log_PEAR.log"

	# Pair-end merging using pear
printf "\n\n### Merging using PEAR ###\n"

for fileR1 in *$R1*; do
    fileR2=${fileR1/$R1/$R2}
    file_o=${fileR1/$R1*.fastq*/}; printf "Merging ${fileR1} and ${fileR2} " 1>&3
    pear -n $minlength -m $maxlength -q $qualitytrimming -u $basesN -j $threads -f $fileR1 -r $fileR2 -o  "../${file_o}"; printf "done\n" 1>&3
	rm -rf $fileR1 $fileR2 
done 3>&1 1>>${LOG_FILE_PEAR} 2>&1


################################
#### Convert fastq to fasta ####
################################
printf "\n\n ### Convert fastq to fasta ###\n"
cd ..

for file in *.assembled.fastq; do
	file_temp=${file/.assembled.fastq/.temp.fasta} # file_temp=${file/##*.fastq/.temp.fasta} with ## the character you want to use a the delineation in the header 
	cat $file | awk '{if(NR%4==1) {printf(">%s\n",substr($0,2));} else if(NR%4==2) print;}' > "$file_temp"; printf "${file_temp} generated\n"
done
printf "\n\n"


###############################################################
#### Change the header names with the name of the fasta file ####
###############################################################
for file in *.temp.fasta; do 
	file_o=${file/.temp.fasta/.fa}
	SampleName=${file/.temp.fasta/}
	awk '/>/{sub(">","&"FILENAME"|");sub(/\.temp.fasta/,x)}1' $file | awk '{print $1}' | awk '{gsub(/:/,"-")}1' > "$file_o"
done

rm -rf *temp.fasta


#############################################
#### Generation of the final multi-fasta ####
#############################################
cat *.fa > ../$output_file; printf "\n\n${workingPATH}${output_file} generated\n\n"


########################################
#### Clean and reorganize the mess! ####
########################################
for file in *.fastq; do gzip ${file}; printf "Zipped ${file}\n"; done
for file in *.fa; do cp ${file} ../fasta/${file}; printf "${workingPATH}fasta/${file} copied\n"; done


rm -rf *.fa
rm -r temp