細菌WGS測序數(shù)據(jù):質(zhì)控、過濾、組裝、評估
使用 trimmomatic 去除fastq數(shù)據(jù)的接頭
# 創(chuàng)建 Raw Reads目錄mkdir -p $OUTWD/Raw_reads# 拷貝 Raw Readscp -f ~/wgs_prokaryote/fastq/*fastq.gz $OUTWD/Raw_reads/# 創(chuàng)建 Clean Reads 目錄mkdir -p $OUTWD/cleaned_reads# 查看變量和接頭文件echo $TRIMMOMATIC $CPUS $OUTWD $ADAPTERS ${QUALITY} ${MINLEN}head $ADAPTERS# 用 trimmomatic 循環(huán)、批量去除fastq數(shù)據(jù)的接頭for i in `ls $OUTWD/Raw_reads | sed 's/_/\t/g' | cut -f 1 | sort -u | head -n 4`; do# 用trimmomatic:過濾# $TRIMMOMATIC PE -threads $CPUS -phred33 $OUTWD/Raw_reads/$i\_1.fastq.gz $OUTWD/Raw_reads/$i\_2.fastq.gz \# $OUTWD/cleaned_reads/$i\.ok_1.fastq.gz /dev/null $OUTWD/cleaned_reads/$i\.ok_2.fastq.gz /dev/null \# ILLUMINACLIP:$ADAPTERS:1:30:11 LEADING:${QUALITY} TRAILING:${QUALITY} MINLEN:${MINLEN}# 用trimmomatic:去接頭$TRIMMOMATIC PE -threads $CPUS -phred33 $OUTWD/Raw_reads/$i\_1.fastq.gz $OUTWD/Raw_reads/$i\_2.fastq.gz \$OUTWD/cleaned_reads/$i\.noadapt.R1.fastq.gz /dev/null $OUTWD/cleaned_reads/$i\.noadapt.R2.fastq.gz /dev/null \ILLUMINACLIP:$ADAPTERS:1:30:11done
# 將4個樣本名寫入文件ls $OUTWD/Raw_reads | sed 's/_/\t/g' | cut -f 1 | sort -u | head -n 4 > $OUTWD/reads-list.tmp# 解壓fastq$PARALLEL -j $CPUS --gnu gzip -d ::: $OUTWD/cleaned_reads/*gz# -d, --decompress decompress# 使用 prinseq 程序過濾 fastqecho $PARALLEL $CPUS $OUTWD $PRINSEQ $MINLEN $QUALITY# 默認的過濾閾值(QUALITY=25, MINLEN=125)可能過于嚴格,導(dǎo)致無法生成滿足條件的、合適大小的fastq文件$PARALLEL -j $CPUS -a $OUTWD/reads-list.tmp \perl $PRINSEQ -verbose \-fastq $OUTWD/cleaned_reads/{}.noadapt.R1.fastq -fastq2 $OUTWD/cleaned_reads/{}.noadapt.R2.fastq \-out_good $OUTWD/cleaned_reads/{}.ok -out_format 3 -out_bad null \-min_len $MINLEN -min_qual_mean $QUALITY \-trim_qual_right $QUALITY -trim_qual_window 15 -trim_qual_type mean
be ignored
min_qual_mean: 129
# 刪除中間文件,壓縮結(jié)果文件$PARALLEL -j $CPUS --gnu rm -f ::: $OUTWD/cleaned_reads/*singletons*$PARALLEL -j $CPUS --gnu rm -rf ::: $OUTWD/cleaned_reads/*noadapt*$PARALLEL -j $CPUS --gnu gzip ::: $OUTWD/cleaned_reads/*fastq# Quality filtering statsfor i in `ls $OUTWD/Raw_reads | sed 's/_/\t/g' | cut -f 1 | sort -u | head -n 4`; dozcat $OUTWD/cleaned_reads/$i\.ok_*.fastq.gz | awk 'BEGIN { t=0.0;sq=0.0; n=0;} ;NR%4==2 {n++;L=length($0);t+=L;sq+=L*L;}END{m=t/n;printf("total\tavg\n%d\t%f\n",n,m,sq/n-m*m);}' \> $OUTWD/cleaned_reads/$i\.ok.statssed -i 's/\..*//' $OUTWD/cleaned_reads/$i\.ok.statsdone# Number of reads surviving to log filefor i in `ls $OUTWD/Raw_reads | sed 's/_/\t/g' | cut -f 1 | sort -u | head -n 4`; doecho -e "Number of reads surviving for sample ${i}: \c"cut -f 1 $OUTWD/cleaned_reads/${i}.ok.stats | tail -n 1done
Number of reads surviving for sample ERR025837: 3954038
Number of reads surviving for sample ERR025838: 2890602
Number of reads surviving for sample ERR038741: 12890710
Number of reads surviving for sample SRR057610: 13663540
## Chechking quality with FASTQCfor i in 1 2; do$PARALLEL -j $CPUS -a $OUTWD/reads-list.tmp $FASTQC $OUTWD/cleaned_reads/{}.ok_${i}.fastq.gz -t 1 -o $OUTWD/cleaned_reads/donerm $OUTWD/cleaned_reads/*zip
組 裝
Software for genome assembly chosen: spades (耗時、耗內(nèi)存)
mkdir -p $OUTWD/assemblymkdir -p $OUTWD/genomesecho $CPUS $SPADESfor i in `ls $OUTWD/Raw_reads | sed 's/_/\t/g' | cut -f 1 | sort -u | head -n 4`; dopython $SPADES --careful -1 $OUTWD/cleaned_reads/$i\.ok_1.fastq.gz -2 $OUTWD/cleaned_reads/$i\.ok_2.fastq.gz \-o $OUTWD/assembly/$i\_assembly -t $CPUSdone# Software for genome assembly chosen: megahit# echo $MEGAHIT $CPUS# for i in `ls $OUTWD/Raw_reads | sed 's/_/\t/g' | cut -f 1 | sort -u | head -n 4`; do# $MEGAHIT -1 $OUTWD/cleaned_reads/$i\.ok_1.fastq.gz -2 $OUTWD/cleaned_reads/$i\.ok_2.fastq.gz \# -o $OUTWD/assembly/$i\_assembly -t $CPUS# done# 對組裝后的基因組重命名: final.contigs.fa --> contigs.fastafor i in `ls $OUTWD/Raw_reads | sed 's/_/\t/g' | cut -f 1 | sort -u | head -n 4`; domv $OUTWD/assembly/$i\_assembly/final.contigs.fa $OUTWD/assembly/$i\_assembly/contigs.fastadone
# 啟動 Prinseq$PARALLEL -j $CPUS -a $OUTWD/reads-list.tmp perl $PRINSEQ -fasta $OUTWD/assembly/{}\_assembly/contigs.fasta \-min_len $MINCONTIGLEN -out_good $OUTWD/genomes/{} -out_bad null# 壓縮、刪除之前的組裝結(jié)果# parallel -j $CPUS --gnu rm -rf ::: $OUTWD/genomes/1/$PARALLEL -j $CPUS -a $OUTWD/reads-list.tmp --gnu tar cfz $OUTWD/assembly/{}\_assembly.tgz -C $OUTWD/assembly/{}\_assembly$PARALLEL -j $CPUS -a $OUTWD/reads-list.tmp --gnu rm -rf $OUTWD/assembly/{}\_assembly
mkdir -p $OUTWD/genome_statsfor i in `ls $OUTWD/Raw_reads | sed 's/_/\t/g' | cut -f 1 | sort -u | head -n 4`; do$QUAST $OUTWD/genomes/${i}.fasta -o $OUTWD/genome_stats/${i}_genome_stats -t $CPUS \--min-contig $MINCONTIGLEN --no-icarus --silent --no-svdone

往期精品(點擊圖片直達文字對應(yīng)教程)
后臺回復(fù)“生信寶典福利第一波”或點擊閱讀原文獲取教程合集

評論
圖片
表情



























