Friday, August 13, 2010

Pipeline for Genomic Variants Identification - Part 2

#use Lyon Robison Lab's aligned data

https://hci-as1.hci.utah.edu/gnomex/gnomexFlex.jsp?launchWindow=AnalysisDetail&analysisNumber=A120

# ~3.86GB
gnomex-analysis-20100813.zip

#after unzipping, the extracted directory takes up 3.7G
>unzip gnomex-analysis-20100813.zip
>du -sh bioinformatics-analysis-A121/

-rw-r--r-- 1 97740 1979 453778525 2010-06-24 09:57 1_84060.novoalign


>pwd
/home/galaxy/tmp/bioinformatics-analysis-A121/84060/ShellScripts

#just check the first fragment (1_84060.novoalign, ~450MB) generated by 1.sh
>less 1.sh
#PBS -l nodes=1:ppn=4,walltime=12:00:00
#PBS -m a
#PBS -M hello@world.com
#PBS -N 1_84060
#PBS -j oe
#PBS -o /XXX/Logs

echo '1_84060'
echo -n 'Start: '; date +%s
/uufs/chpc.utah.edu/common/home/u0028003/BioApps/Novocraft/novocraft/novoalign -F ILMFQ -t120 -r0.2 -q5 -d /uufs/chpc.utah.edu/common/home/u0028003/Genomes/hg19Splices34bpAdaptersNovo.index -f /scratch/serial/u0028003/84060/SplitFastqData1/1.txt /scratch/serial/u0028003/84060/SplitFastqData2/1.txt > /scratch/serial/u0028003/84060/Alignments/1_84060.novoalign

gzip /scratch/serial/u0028003/84060/Alignments/1_84060.novoalign
echo -n 'End: '; date +%s


>gunzip 1_84060.novoalign.gz
>wc -l 1_84060.novoalign
2140441 1_84060.novoalign

>tail -n 100 1_84060.novoalign
# Paired Reads: 1000000
# Pairs Aligned: 867741
# Read Sequences: 2000000
# Aligned: 1765701
# Unique Alignment: 1636245
# Gapped Alignment: 5509
# Quality Filter: 27456
# Homopolymer Filter: 13
# Elapsed Time: 345,966s
# Fragment Length Distribution
# From To Count
# 72 77 487
# 78 83 1103
# Mean 201, Std Dev 59.4
# Done.

No comments:

Post a Comment