******************************************************************************
#1. download dbSNP mouse
wget --continue --no-host-directories --no-directories --preserve-permissions ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/chr_rpts/*
#2. unzip
gunzip *.gz
#3. exclude all SNPs that are not mapped to mm9 (MGSCv37) genome.
for i in `ls`; do grep 'MGSCv37' "$i" > mm9."$i" ; done
#However, you can NOT find REF/ALT field in above files. So we must get it from other files.
#4. download dbSNP mouse with genotype
wget --continue --no-host-directories --no-directories --preserve-permissions ftp://ftp.ncbi.nih.gov/snp/organisms/mouse_10090/genotype/*
#5. unzip
gunzip *.gz
#6. check out these files, should end with ".xml"
ls
Black6GtyFromContig gt_chr13.xml gt_chr17.xml gt_chr2.xml gt_chr6.xml gt_chrAltOnly.xml gt_chrUn.xml
gt_chr10.xml gt_chr14.xml gt_chr18.xml gt_chr3.xml gt_chr7.xml gt_chrMT.xml gt_chrX.xml
gt_chr11.xml gt_chr15.xml gt_chr19.xml gt_chr4.xml gt_chr8.xml gt_chrMulti.xml gt_chrY.xml
gt_chr12.xml gt_chr16.xml gt_chr1.xml gt_chr5.xml gt_chr9.xml gt_chrNotOn.xml README
#7. make a 3 column file: "id REF ALT"
grep '
#8. write a simple python script that mapping id from mm9.chr_N.txt to make the final VCF file - "mm9.dbsnp.vcf"
def mm9():
m = {}
for line in file('id.allele.txt'):
line = line.strip()
toks=line.split(' ')
if len(toks)==3:
m[toks[0]] = (toks[1],toks[2])
#chr1 - chr19, chr X,Y and MT only.
nfs = [str(i) for i in range(1,20)]
nfs.append('X')
nfs.append('Y')
nfs.append('MT')
fo = file('mm9.dbsnp.vcf','w')
print >>fo,'##fileformat=VCFv4.0'
print >>fo,'##fileDate=20110712'
print >>fo,'##source=dbSNP'
print >>fo,'##dbSNP_BUILD_ID=132'
print >>fo,'##reference=mm9'
print >>fo,'#CHROM POS ID REF ALT QUAL FILTER INFO'
total = 0
for n in nfs:
sm = {}
fn = os.path.join('mm9.chr_%s.txt'%n)
for line in file(fn):
line = line.strip()
toks=line.split('\t')
try:
rs_id = toks[0]
mapped = toks[1]
withdraw =toks[2]
chr =toks[6]
position =toks[11]
if m.has_key(rs_id) and mapped=='2' and withdraw=='0':
ref,alt = m[rs_id]
#use an arbitratry "ms"+ID as prefix refID (instead of "rs")
sm[int(position)] = [chr,position,'ms'+rs_id,ref,alt,'.','PASS','.']
except Exception,e:
pass
sk = sorted(sm.keys())
for k in sk:
total += 1
print >>fo,'\t'.join(sm[k])
fo.close()
print total
mm9()
#9. make sure no duplicated IDs
awk '{print $3}' mm9.dbsnp.vcf | sort| uniq -d
13745646
13501994