Tuesday, August 19, 2014

bamsplitter

#!/bin/bash

#split the BAM file by chromosome on NameNode's local disk
pathSamtools=$1 #the full path to samtools. e.g /X/Y/Z/samtoools
bamInputFile=$2 #input BAM file name. e.g. /A/B/C/d.bam
bamOutputDir=$3 #output folder for splitted BAM file. The full path will be /A/B/C/E/


#bamInputFile="/A/B/C/d.bam"

#/A/B/C/
bamOutputPath=${bamInputFile%/*}"/"${bamOutputDir}

#"d.bam"
bamFileName=$(basename "$bamInputFile")

#"d"
bamFileNameNoExt=${bamFileName%%.*}

#"/A/B/C/d.bam.bai"
bamFileIndex=${bamInputFile}."bai"

#"/A/B/C/d"
bamFilePathNoExt=${bamInputFile%%.*}

#"bam"
#bamFileNameExt="${bam##*.}"

bam=${bamInputFile}

numCPU=$(grep -c ^processor /proc/cpuinfo)

sorted=$(samtools view -H ${bamInputFile} | grep SO:coordinate)
if [ -z "$sorted" ];then
        #unsorted
        ${pathSamtools} sort ${bamInputFile} ${bamFilePathNoExt}".sort"
        #create BAM index
bam=${bamFilePathNoExt}".sort.bam"
        ${pathSamtools} index ${bam}
else #sorted
        if [ ! -f "${bamFileIndex}" ];then #create BAM index if not exist
        ${pathSamtools} index ${bam}
        fi
fi

#create output folder if not exist
if [ ! -d "${bamOutputPath}" ]; then
        mkdir -p ${bamOutputPath}
fi

#split the BAM by chromosome
for i in `${pathSamtools} view -H ${bam} | awk -F"\t" '/@SQ/{print $2}' |  cut -d":" -f2`
do
${pathSamtools} view -@ ${numCPU} -h -F 0x4 -q 10 ${bam} $i  | \
awk '{if(/^@SQ/ && !/^@SQ\tSN:'$i'/) next} {if(/^@/) print $0}{if ($3~/^'$i'/) print $0}' | \
${pathSamtools} view -@ ${numCPU} -hbS - > ${bamOutputPath}/${bamFileNameNoExt}.$i.bam 2>/dev/null
done

No comments:

Post a Comment