Skip to content

Commit

Permalink
Support BAMs with >65535 CIGAR operations
Browse files Browse the repository at this point in the history
Due to a design flaw, the original BAM format is unable to store an alignment
with >65535 CIGAR operations. The SAM/BAM specification maintainers have
decided to move the actual CIGAR to a CG optional tag and write a fake CIGAR
`<readLen>S<refLen>N` at the original CIGAR place.

This PR recognizes the CG tag and seamlessly moves the real CIGAR back to its
right place and update the `bin` field accordingly. Library users need not take
any actions.

The convert and sort commands of command-line bamtools have been tested on BAMs
containing the CG tag.
  • Loading branch information
lh3 authored and pezmaster31 committed Dec 1, 2017
1 parent 48233a2 commit 3705ed2
Show file tree
Hide file tree
Showing 3 changed files with 168 additions and 14 deletions.
91 changes: 90 additions & 1 deletion src/api/internal/bam/BamReader_p.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -246,6 +246,91 @@ void BamReaderPrivate::LoadHeaderData()
m_header.Load(&m_stream);
}

static inline int bam_aux_type2size(int x)
{
if (x == 'C' || x == 'c' || x == 'A') return 1;
else if (x == 'S' || x == 's') return 2;
else if (x == 'I' || x == 'i' || x == 'f') return 4;
else return 0;
}

static unsigned char *bam_aux_get(int aux_data_len, const unsigned char *aux_start, const char *tag)
{
const unsigned char *p = aux_start;
while (p < aux_start + aux_data_len) {
if (p[0] == tag[0] && p[1] == tag[1]) return (unsigned char*)(p + 2);
p += 2; // skip tag
int type = *p++; // read type
if (type == 'B') {
int size = bam_aux_type2size(*p++); // read array type
unsigned len = (unsigned)p[0] | (unsigned)p[1]<<8 | (unsigned)p[2]<<16 | (unsigned)p[3]<<24;
p += 4; // skip the size field
p += len * size; // skip array
} else if (type == 'Z' || type == 'H') {
while (*p++ != 0) {} // skip NULL terminated string
} else {
p += bam_aux_type2size(type); // skip value
}
}
return NULL;
}

static inline int hts_reg2bin(int64_t beg, int64_t end, int min_shift, int n_lvls)
{
int l, s = min_shift, t = ((1<<((n_lvls<<1) + n_lvls)) - 1) / 7;
for (--end, l = n_lvls; l > 0; --l, s += 3, t -= 1<<((l<<1)+l))
if (beg>>s == end>>s) return t + (beg>>s);
return 0;
}

bool BamReaderPrivate::Tag2Cigar(BamAlignment &a, RaiiBuffer &buf)
{
if (a.RefID < 0 || a.Position < 0 || a.SupportData.NumCigarOperations == 0) return false;

const unsigned char *data = (const unsigned char*)buf.Buffer;
const unsigned data_len = a.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
const unsigned char *p = data + a.SupportData.QueryNameLength; // the original CIGAR
unsigned cigar1 = (unsigned)p[0] | (unsigned)p[1]<<8 | (unsigned)p[2]<<16 | (unsigned)p[3]<<24;
if ((cigar1&0xf) != 4 || cigar1>>4 != a.SupportData.QuerySequenceLength) return false;

const int seq_offset = a.SupportData.QueryNameLength + a.SupportData.NumCigarOperations * 4;
const int aux_offset = seq_offset + (a.SupportData.QuerySequenceLength + 1) / 2 + a.SupportData.QuerySequenceLength;
unsigned char *CG = bam_aux_get(data_len - aux_offset, data + aux_offset, "CG");
if (CG == NULL || CG[0] != 'B' || CG[1] != 'I') return false;

const unsigned tag_cigar_len = (unsigned)CG[2] | (unsigned)CG[3]<<8 | (unsigned)CG[4]<<16 | (unsigned)CG[5]<<24;
if (tag_cigar_len == 0) return false;

// recalculate bin, as it may be incorrect if it was calculated by a tool unaware of the real CIGAR in tag
const unsigned tag_cigar_offset = CG - data + 6;
unsigned alignment_end = a.Position;
p = data + tag_cigar_offset;
for (unsigned i = 0; i < tag_cigar_len * 4; i += 4, p += 4) {
unsigned cigar1 = (unsigned)p[0] | (unsigned)p[1]<<8 | (unsigned)p[2]<<16 | (unsigned)p[3]<<24;
int op = cigar1 & 0xf;
if (op == 0 || op == 2 || op == 3 || op == 7 || op == 8)
alignment_end += cigar1 >> 4;
}
a.Bin = hts_reg2bin(a.Position, alignment_end, 14, 5);

// populate new AllCharData
int fake_bytes = a.SupportData.NumCigarOperations * 4;
std::string new_data;
new_data.reserve(data_len - 8 - fake_bytes + 1);
new_data.append((char*)data, a.SupportData.QueryNameLength); // query name
new_data.append((char*)data + tag_cigar_offset, tag_cigar_len * 4); // real CIGAR
new_data.append((char*)data + seq_offset, tag_cigar_offset - 8 - seq_offset); // seq, qual and tags before CG
const unsigned tag_cigar_end_offset = tag_cigar_offset + tag_cigar_len * 4;
if (tag_cigar_end_offset < data_len) // tags after CG, if there is any
new_data.append((char*)data + tag_cigar_end_offset, data_len - tag_cigar_end_offset);

// update member variables
a.SupportData.NumCigarOperations = tag_cigar_len;
a.SupportData.BlockLength -= 8 + fake_bytes;
memcpy(buf.Buffer, new_data.c_str(), buf.NumBytes - 8 - fake_bytes);
return true;
}

// populates BamAlignment with alignment data under file pointer, returns success/fail
bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment)
{
Expand Down Expand Up @@ -291,11 +376,15 @@ bool BamReaderPrivate::LoadNextAlignment(BamAlignment& alignment)

// read in character data - make sure proper data size was read
bool readCharDataOK = false;
const unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
unsigned int dataLength = alignment.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
RaiiBuffer allCharData(dataLength);

if (m_stream.Read(allCharData.Buffer, dataLength) == dataLength) {

int OldNumCigarOperations = alignment.SupportData.NumCigarOperations;
if (Tag2Cigar(alignment, allCharData))
dataLength -= 8 + OldNumCigarOperations * 4;

// store 'allCharData' in supportData structure
alignment.SupportData.AllCharData.assign((const char*)allCharData.Buffer, dataLength);

Expand Down
1 change: 1 addition & 0 deletions src/api/internal/bam/BamReader_p.h
Original file line number Diff line number Diff line change
Expand Up @@ -53,6 +53,7 @@ class BamReaderPrivate
// access alignment data
bool GetNextAlignment(BamAlignment& alignment);
bool GetNextAlignmentCore(BamAlignment& alignment);
bool Tag2Cigar(BamAlignment &alignment, RaiiBuffer &buf);

// access auxiliary data
std::string GetHeaderText() const;
Expand Down
90 changes: 77 additions & 13 deletions src/api/internal/bam/BamWriter_p.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,6 +295,8 @@ void BamWriterPrivate::WriteAlignment(const BamAlignment& al)
queryLength + // here referring to quality length
tagDataLength;
unsigned int blockSize = Constants::BAM_CORE_SIZE + dataBlockSize;
if (numCigarOperations >= 65536)
blockSize += 16;
if (m_isBigEndian) BamTools::SwapEndian_32(blockSize);
m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);

Expand All @@ -303,7 +305,7 @@ void BamWriterPrivate::WriteAlignment(const BamAlignment& al)
buffer[0] = al.RefID;
buffer[1] = al.Position;
buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | nameLength;
buffer[3] = (al.AlignmentFlag << 16) | numCigarOperations;
buffer[3] = (al.AlignmentFlag << 16) | (numCigarOperations < 65536? numCigarOperations : 2);
buffer[4] = queryLength;
buffer[5] = al.MateRefID;
buffer[6] = al.MatePosition;
Expand All @@ -322,17 +324,29 @@ void BamWriterPrivate::WriteAlignment(const BamAlignment& al)
m_stream.Write(al.Name.c_str(), nameLength);

// write the packed cigar
if (m_isBigEndian) {
char* cigarData = new char[packedCigarLength]();
memcpy(cigarData, packedCigar.data(), packedCigarLength);
if (numCigarOperations < 65536) {
if (m_isBigEndian) {
for (std::size_t i = 0; i < packedCigarLength; ++i)
BamTools::SwapEndian_32p(&cigarData[i]);
char* cigarData = new char[packedCigarLength]();
memcpy(cigarData, packedCigar.data(), packedCigarLength);
if (m_isBigEndian) {
for (size_t i = 0; i < packedCigarLength; ++i)
BamTools::SwapEndian_32p(&cigarData[i]);
}
m_stream.Write(cigarData, packedCigarLength);
delete[] cigarData; // TODO: cleanup on Write exception thrown?
}
m_stream.Write(cigarData, packedCigarLength);
delete[] cigarData; // TODO: cleanup on Write exception thrown?
} else
m_stream.Write(packedCigar.data(), packedCigarLength);
else
m_stream.Write(packedCigar.data(), packedCigarLength);
} else {
unsigned int cigar[2];
cigar[0] = queryLength << 4 | 4;
cigar[1] = (al.GetEndPosition() - al.Position) << 4 | 3;
if (m_isBigEndian) {
BamTools::SwapEndian_32(cigar[0]);
BamTools::SwapEndian_32(cigar[1]);
}
m_stream.Write((char*)cigar, 8);
}

if (queryLength > 0) {

Expand Down Expand Up @@ -450,13 +464,37 @@ void BamWriterPrivate::WriteAlignment(const BamAlignment& al)
delete[] tagData; // TODO: cleanup on Write exception thrown?
} else
m_stream.Write(al.TagData.data(), tagDataLength);

if (numCigarOperations >= 65536) {
m_stream.Write("CGBI", 4);
if ( m_isBigEndian ) {
unsigned int cigar_len_buf = numCigarOperations;
BamTools::SwapEndian_32(cigar_len_buf);
m_stream.Write((char*)&cigar_len_buf, 4);

char* cigarData = new char[packedCigarLength]();
memcpy(cigarData, packedCigar.data(), packedCigarLength);
if ( m_isBigEndian ) {
for ( size_t i = 0; i < packedCigarLength; ++i ) // FIXME: similarly, this should be "i += 4", not "++i"
BamTools::SwapEndian_32p(&cigarData[i]);
}
m_stream.Write(cigarData, packedCigarLength);
delete[] cigarData; // TODO: cleanup on Write exception thrown?
}
else {
m_stream.Write((char*)&numCigarOperations, 4);
m_stream.Write(packedCigar.data(), packedCigarLength);
}
}
}

void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al)
{

// write the block size
unsigned int blockSize = al.SupportData.BlockLength;
if (al.SupportData.NumCigarOperations >= 65536)
blockSize += 16;
if (m_isBigEndian) BamTools::SwapEndian_32(blockSize);
m_stream.Write((char*)&blockSize, Constants::BAM_SIZEOF_INT);

Expand All @@ -468,7 +506,7 @@ void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al)
buffer[0] = al.RefID;
buffer[1] = al.Position;
buffer[2] = (alignmentBin << 16) | (al.MapQuality << 8) | al.SupportData.QueryNameLength;
buffer[3] = (al.AlignmentFlag << 16) | al.SupportData.NumCigarOperations;
buffer[3] = (al.AlignmentFlag << 16) | (al.SupportData.NumCigarOperations < 65536? al.SupportData.NumCigarOperations : 2);
buffer[4] = al.SupportData.QuerySequenceLength;
buffer[5] = al.MateRefID;
buffer[6] = al.MatePosition;
Expand All @@ -484,8 +522,34 @@ void BamWriterPrivate::WriteCoreAlignment(const BamAlignment& al)
m_stream.Write((char*)&buffer, Constants::BAM_CORE_SIZE);

// write the raw char data
m_stream.Write((char*)al.SupportData.AllCharData.data(),
al.SupportData.BlockLength - Constants::BAM_CORE_SIZE);
if (al.SupportData.NumCigarOperations < 65536) {
m_stream.Write((char*)al.SupportData.AllCharData.data(),
al.SupportData.BlockLength - Constants::BAM_CORE_SIZE);
} else {
const char *data = al.SupportData.AllCharData.c_str();
const unsigned data_len = al.SupportData.BlockLength - Constants::BAM_CORE_SIZE;
const unsigned cigar_offset = al.SupportData.QueryNameLength;
const unsigned seq_offset = cigar_offset + al.SupportData.NumCigarOperations * 4;
unsigned fake_cigar[2];
fake_cigar[0] = al.SupportData.QuerySequenceLength << 4 | 4;
fake_cigar[1] = (al.GetEndPosition() - al.Position) << 4 | 3;
m_stream.Write(data, al.SupportData.QueryNameLength);
if (m_isBigEndian) {
BamTools::SwapEndian_32(fake_cigar[0]);
BamTools::SwapEndian_32(fake_cigar[1]);
}
m_stream.Write((char*)&fake_cigar, 8);
m_stream.Write(data + seq_offset, data_len - seq_offset);
m_stream.Write("CGBI", 4);
if (m_isBigEndian) {
unsigned cigar_len_buf = al.SupportData.NumCigarOperations;
BamTools::SwapEndian_32(cigar_len_buf);
m_stream.Write((char*)&cigar_len_buf, 4);
} else {
m_stream.Write((char*)&al.SupportData.NumCigarOperations, 4);
}
m_stream.Write(data + cigar_offset, al.SupportData.NumCigarOperations * 4);
}
}

void BamWriterPrivate::WriteMagicNumber()
Expand Down

0 comments on commit 3705ed2

Please sign in to comment.