#split the BAM file by chromosome on NameNode's local disk
pathSamtools=$1 #the full path to samtools. e.g /X/Y/Z/samtoools
bamInputFile=$2 #input BAM file name. e.g. /A/B/C/d.bam
bamOutputDir=$3 #output folder for splitted BAM file. The full path will be /A/B/C/E/
#bamInputFile="/A/B/C/d.bam"
#/A/B/C/
bamOutputPath=${bamInputFile%/*}"/"${bamOutputDir}
#"d.bam"
bamFileName=$(basename "$bamInputFile")
#"d"
bamFileNameNoExt=${bamFileName%%.*}
#"/A/B/C/d.bam.bai"
bamFileIndex=${bamInputFile}."bai"
#"/A/B/C/d"
bamFilePathNoExt=${bamInputFile%%.*}
#"bam"
#bamFileNameExt="${bam##*.}"
bam=${bamInputFile}
numCPU=$(grep -c ^processor /proc/cpuinfo)
sorted=$(samtools view -H ${bamInputFile} | grep SO:coordinate)
if [ -z "$sorted" ];then
#unsorted
${pathSamtools} sort ${bamInputFile} ${bamFilePathNoExt}".sort"
#create BAM index
bam=${bamFilePathNoExt}".sort.bam"
${pathSamtools} index ${bam}
else #sorted
if [ ! -f "${bamFileIndex}" ];then #create BAM index if not exist
${pathSamtools} index ${bam}
fi
fi
#create output folder if not exist
if [ ! -d "${bamOutputPath}" ]; then
mkdir -p ${bamOutputPath}
fi
#split the BAM by chromosome
for i in `${pathSamtools} view -H ${bam} | awk -F"\t" '/@SQ/{print $2}' | cut -d":" -f2`
do
${pathSamtools} view -@ ${numCPU} -h -F 0x4 -q 10 ${bam} $i | \
awk '{if(/^@SQ/ && !/^@SQ\tSN:'$i'/) next} {if(/^@/) print $0}{if ($3~/^'$i'/) print $0}' | \
${pathSamtools} view -@ ${numCPU} -hbS - > ${bamOutputPath}/${bamFileNameNoExt}.$i.bam 2>/dev/null
done
No comments:
Post a Comment