Monday, August 5, 2013

Create some artificial SNPs on a reference genome



import random
import sys

random.seed(13115)

def iter_chromosome(fasta):
    m = {}
    seq = []
    chr = ''
    for line in file(fasta):
        line = line.strip()
        if line.startswith('>'):
            if seq: #
                yield chr,''.join(seq)
                seq = []
            chr = line
        else:
            seq.append(line)
    yield chr,''.join(seq)

def make_snp(ref):
    x = ['A','G','C','T']
    x.remove(ref)
    return random.choice(x)

def generate_SNP(fasta,interval=1250):
    import numpy
    exon = {}.fromkeys(('A','G','C','T'))
    snp = []
    fsnp=file('snp.txt','w')
    for chr, seq in iter_chromosome(fasta):
        dup = [s for s in seq]
        N = len(dup)
        num_required=N/interval
        pos = [int(i) for i in numpy.random.normal(interval,interval/5,num_required)]
        index = 0
        for p in pos:
            index = index + p
            try:
                ref = dup[index-1]
                if exon.has_key(ref):
                    alt = make_snp(ref)
                    dup[index-1] = alt
                    snp.append('\t'.join((chr.lstrip('>'),str(index),ref,alt)))
            except IndexError:
                break
        print chr
        for i in xrange(0, len(dup), 50):
            print ''.join(dup[i:i+50])
    print >>fsnp,'\n'.join(snp)
    fsnp.close()
   
fasta = 'hg19.fa'
generate_SNP(fasta,250)



No comments:

Post a Comment