Monday, February 7, 2011

Create a GATK-compatible dbSNP file

# Get 00-All.vcf.gz from the NCBI dbSNP FTP area (human_9606, VCF v4.0,
# ByChromosomeNoGeno) and decompress it to 00-All.vcf.
# NOTE(review): lftp given a URL may only open a session at that location;
# confirm whether a `get` was issued inside lftp to actually fetch the file.
$ lftp ftp.ncbi.nih.gov/snp/organisms/human_9606/VCF/v4.0/ByChromosomeNoGeno/00-All.vcf.gz
$ gunzip 00-All.vcf.gz
# Count the header lines. The pattern is anchored with ^ so that only lines
# that START with '#' are counted, not data lines that merely contain one.
$ grep -c '^#' 00-All.vcf
57

# Build the header-less body with "chr"-prefixed contig names.
# 58 = number of header lines counted above (57) + 1, i.e. the first data line.
# Every test is applied to the CHROM column ($1) only: matching against the
# whole line would drop any record whose annotation text happens to contain
# "PAR", "_hap" or "_random", and would rewrite the first "MT" found anywhere
# on a line instead of just the mitochondrial contig name.
# FS/OFS are set to a tab so that assigning to $1 keeps the line tab-delimited.
$ tail -n +58 00-All.vcf \
    | awk 'BEGIN { FS = OFS = "\t" }
           $1 ~ /_random|_hap|PAR/ { next }
           { if ($1 == "MT") $1 = "M"; $1 = "chr" $1; print }' \
    > dbsnp132.human.body.vcf

# Sanity check: list the distinct values of the first column (the contig
# names) that ended up in the body file.
$ awk '{print $1}' dbsnp132.human.body.vcf | sort -u

chr1
chr10
chr11
chr12
chr13
chr14
chr15
chr16
chr17
chr18
chr19
chr2
chr20
chr21
chr22
chr3
chr4
chr5
chr6
chr7
chr8
chr9
chrM
chrX
chrY


# Save the first 57 lines of the original file (the count reported by grep
# above) as the header.
$ head -n 57 00-All.vcf > dbsnp132.human.head.vcf

# Header first, then the renamed body, gives the final VCF.
$ cat dbsnp132.human.head.vcf dbsnp132.human.body.vcf > dbsnp132.human.vcf

# Remove the two intermediate files; only dbsnp132.human.vcf is kept.
$ rm dbsnp132.human.body.vcf dbsnp132.human.head.vcf

No comments:

Post a Comment