This script follows from A0.get_samples.sh to process RNA-seq sequencing data, including adapter trimming, alignment, duplicate removal, and quality control calculation comparing raw vs. processed.

R0_process_rna.sh

Arguments

PREFIX

sample directory where fastq files are located in a" directory fastq (thanks to 00.get-samples.sh) "

THREADS

allocated threads for the job; note that memory amt" proportional to number of threads / 20 * 128G (Ruddle)" which ends up being about 6GB per thread"

Details

This script requires the following:

  • cutadapt 1.10

  • samtools 1.3.1, htslib 1.3.1

  • fastqc 0.11.2

  • quip 1.1.8

Examples


#' Preset parameters
MEMPERTHREAD=6    # GB; 128 GB of RAM / 20 cores per node
TRUSEQ_LEFT=AGATCGGAAGAGCACACGTCTGAACTCCAGTCAC
TRUSEQ_RIGHT=AGATCGGAAGAGCGTCGTGTAGGGAAAGAGTGTAGATCTCGGTGGTCGCCGTATCATT

#' 0. Read in Parameters
SDIR=$(readlink -f $1)            # Sample directory
PREFIX=$(basename $SDIR)          # prefix for files 
THREADS=$2

#' 0. Echo parameters back
echo ""
echo "Beginning to process your paired-end RNA-seq data.."
echo ".. Memory per thread is: $MEMPERTHREAD GB"
echo ".. Cutting the TruSeq Adapters (see cutadapt documentation)"
echo ""
echo "Working on: $SDIR"
echo ".. Using prefix: $PREFIX"
echo ".. Using $THREADS threads"
echo ""


#' 1. Concatenate fastq files into R1 and R2
echo "01. Concatenating fastq files together.."
mkdir -p $SDIR/raw
cd $SDIR
# cat fastq/*R1*.fastq.gz > raw/${PREFIX}_R1.fastq.gz
# cat fastq/*R2*.fastq.gz > raw/${PREFIX}_R2.fastq.gz
cat <(quip --stdout fastq/*R1*.fastq.qp) <(gunzip --stdout fastq/*R1*.fastq.gz) | gzip - > raw/${PREFIX}_R1.fastq.gz
cat <(quip --stdout fastq/*R2*.fastq.qp) <(gunzip --stdout fastq/*R2*.fastq.gz) | gzip - > raw/${PREFIX}_R2.fastq.gz


#' 2. Cut adapters (TruSeq)
echo "02. Cutting adapters and trimming poor quality ends.."
mkdir -p $SDIR/cutadapt
cd $SDIR/cutadapt

cutadapt \
    -a $TRUSEQ_LEFT -A $TRUSEQ_RIGHT \
    --quality-cutoff=30,20 \
    -o ${PREFIX}_R1.cut.fastq -p ${PREFIX}_R2.cut.fastq \
    ../raw/${PREFIX}_R1.fastq.gz ../raw/${PREFIX}_R2.fastq.gz > ${PREFIX}_cutadapt-report.txt

echo ".. Zipping up the cut and trimmed sequences.."
gzip -f ${PREFIX}_R1.cut.fastq
gzip -f ${PREFIX}_R2.cut.fastq


#' 3. Quality Control
# echo "03. Running FastQC on raw and processed data.."
# mkdir -p $SDIR/FastQC
# cd $SDIR/FastQC
# mkdir -p fqc_raw; mkdir -p fqc_cutadapt

# echo ".. Processing raw data.."
# zcat ../raw/${PREFIX}*fastq.gz | fastqc stdin -o fqc_raw --noextract -t $THREADS
# mv fqc_raw/stdin_fastqc.zip ${PREFIX}_raw_fastqc.zip

# echo ".. Processing cut data.."
# zcat ../cutadapt/${PREFIX}*cut.fastq.gz | fastqc stdin -o fqc_cutadapt --noextract -t $THREADS
# mv fqc_cutadapt/stdin_fastqc.zip ${PREFIX}_cutadapt_fastqc.zip


#' 4. Success?
echo ""
echo "Pipeline completed."