diff --git a/Jamroot.jam b/Jamroot.jam index 6ed979cb61..0d9290df17 100644 --- a/Jamroot.jam +++ b/Jamroot.jam @@ -49,6 +49,7 @@ ECHO # tarballs - useful after your initial build # # --without-mz5 Build without mz5 support +# --without-mzmlb Build without mzMLb support # --without-agilent Build without Agilent support # --without-bruker Build without Bruker support # --without-sciex Build without Sciex support @@ -152,9 +153,16 @@ if ( "--without-binary-msdata" in [ modules.peek : ARGV ] ) # support only text { echo "NOTICE: building without support for binary msdata formats as requested" ; } -else if ( "--without-mz5" in [ modules.peek : ARGV ] ) +else { - echo "NOTICE: building without mz5 support as requested" ; + if ( "--without-mz5" in [ modules.peek : ARGV ] ) + { + echo "NOTICE: building without mz5 support as requested" ; + } + if ( "--without-mzmlb" in [ modules.peek : ARGV ] ) + { + echo "NOTICE: building without mzMLb support as requested" ; + } } # do we want to skip mz5 support? @@ -170,6 +178,29 @@ rule without-mz5 ( properties * ) } } +# do we want to skip mzMLb support? +rule without-mzmlb ( properties * ) +{ + if --without-mzmlb in [ modules.peek : ARGV ] + { + return without-mzmlb WITHOUT_MZMLB ; + } + else + { + return [ without-binary-msdata $(properties) ] ; + } +} + +# do we want to skip all binary file formats (vendors and mz5)? +rule without-binary-msdata ( properties * ) +{ + if --without-binary-msdata in [ modules.peek : ARGV ] + { + return without-binary-msdata WITHOUT_MZ5 WITHOUT_MZMLB off ; + } +} + + # do we want to skip all binary file formats (vendors and mz5)? rule without-binary-msdata ( properties * ) { @@ -259,6 +290,16 @@ rule mz5-build ( itemname ) } +# return the itemname if mzmlb build is on (the default) +rule mzmlb-build ( itemname ) +{ + if ( ! [ without-mzmlb ] ) + { + return $(itemname) ; # do build mzmlb + } +} + + if ! [ modules.peek : NT ] { # make msbuild targets a no-op diff --git a/pwiz.vcxproj b/pwiz.vcxproj index 69d78a3300..2ee9717ebb 100644 --- a/pwiz.vcxproj +++ b/pwiz.vcxproj @@ -25,7 +25,7 @@ Application $(SolutionDir)build-nt-x86\msvc-release - v141 + v143 MultiByte true @@ -33,7 +33,9 @@ v143 Application MultiByte - true + + + v143 @@ -66,6 +68,9 @@ $(SolutionDir);$(SolutionDir)libraries\boost_aux;$(SolutionDir)libraries\boost_1_76_0;$(SolutionDir)libraries\SQLite;$(SolutionDir)libraries\CSpline;$(SolutionDir)libraries\Eigen;$(IncludePath) $(SolutionDir);$(SolutionDir)libraries\boost_aux;$(SolutionDir)libraries\boost_1_76_0;$(SolutionDir)libraries\SQLite;$(SolutionDir)libraries\CSpline;$(SolutionDir)libraries\Eigen;$(IncludePath) + + true + @@ -76,8 +81,8 @@ - PWIZ_READER_THERMO;PWIZ_READER_AGILENT;PWIZ_READER_BRUKER;PWIZ_READER_WATERS;PWIZ_READER_ABI;PWIZ_READER_ABI_T2D;PWIZ_READER_SHIMADZU;PWIZ_READER_UNIFI;WIN32;USE_RAW_PTR;PWIZ_READER_BRUKER_WITH_COMPASSXTRACT;PWIZ_READER_UNIFI;PWIZ_READER_UIMF;%(PreprocessorDefinitions) - true + PWIZ_READER_THERMO;PWIZ_READER_AGILENT;PWIZ_READER_BRUKER;PWIZ_READER_WATERS;PWIZ_READER_ABI;PWIZ_READER_ABI_T2D;PWIZ_READER_SHIMADZU;PWIZ_READER_UNIFI;WIN32;PWIZ_READER_BRUKER_WITH_COMPASSXTRACT;PWIZ_READER_UNIFI;PWIZ_READER_UIMF;%(PreprocessorDefinitions) + true @@ -120,7 +125,6 @@ - @@ -286,9 +290,7 @@ - - @@ -342,6 +344,7 @@ + @@ -852,6 +855,7 @@ ThermoFisher.CommonCore.RawFileReader.dll;ThermoFisher.CommonCore.Data.dll pwiz_aux\msrc\utility\vendor_api\thermo + true @@ -859,7 +863,9 @@ - + + Level1 + @@ -960,6 +966,7 @@ + diff --git a/pwiz.vcxproj.filters b/pwiz.vcxproj.filters index c23333035c..ec83a5e621 100644 --- a/pwiz.vcxproj.filters +++ b/pwiz.vcxproj.filters @@ -153,9 +153,7 @@ - - @@ -526,7 +524,6 @@ - @@ -593,6 +590,7 @@ + @@ -1066,6 +1064,7 @@ + diff --git a/pwiz/data/common/diff_std.cpp b/pwiz/data/common/diff_std.cpp index 060ccc84af..e113e505a2 100644 --- a/pwiz/data/common/diff_std.cpp +++ b/pwiz/data/common/diff_std.cpp @@ -156,7 +156,7 @@ void diff(const CVParam& a, // lexical_cast is happy to read "1.1" as "1" - and "1.9" the same way if ((std::string::npos == a.value.find_first_of(".eE")) && (std::string::npos == b.value.find_first_of(".eE"))) // any float-like chars? - { + { bool successA, successB; // compare as ints if possible int ia = lexical_cast(a.value, successA); @@ -165,7 +165,7 @@ void diff(const CVParam& a, asString = true; else { - if (ia != ib) + if (ia != ib) { a_b.value = lexical_cast(ia); b_a.value = lexical_cast(ib); @@ -174,7 +174,7 @@ void diff(const CVParam& a, { if ((std::string::npos == a.value.find_first_not_of("0123456789")) && (std::string::npos == b.value.find_first_not_of("0123456789"))) - { + { a_b.value.clear(); b_a.value.clear(); } @@ -185,7 +185,8 @@ void diff(const CVParam& a, } } } - else + else if (std::string::npos == a.value.find_first_not_of("01234567890.e-") && + std::string::npos == b.value.find_first_not_of("01234567890.e-")) { // use precision to compare floating point values bool successA, successB; @@ -209,6 +210,9 @@ void diff(const CVParam& a, } } } + else + asString = true; + if (asString) { diff_string(a.value, b.value, a_b.value, b_a.value); diff --git a/pwiz/data/msdata/BinaryDataEncoder.cpp b/pwiz/data/msdata/BinaryDataEncoder.cpp index abbd88c20a..ef8d04620f 100644 --- a/pwiz/data/msdata/BinaryDataEncoder.cpp +++ b/pwiz/data/msdata/BinaryDataEncoder.cpp @@ -233,7 +233,10 @@ void BinaryDataEncoder::Impl::encode(const double* data, size_t dataSize, std::s } } if (n>=0) + { config_.numpress = Numpress_None; // excessive error, don't numpress + if (config_.format == Format_MzMLb) return; + } else byteBuffer = reinterpret_cast(&numpressed[0]); } catch (int e) { @@ -311,27 +314,52 @@ void BinaryDataEncoder::Impl::encode(const double* data, size_t dataSize, std::s } } - // Base64 encoding - - result.resize(Base64::binaryToTextSize(byteCount)); - - // std::string storage is not guaranteed contiguous in older C++ standards, - // and on long strings this has caused problems in the wild. So test for - // actual contiguousness, and fall back to std::vector if needed - // thx Johan Teleman - size_t textSize; - char *first = &result[0]; - char *last = &result[result.size()-1]; - if ((int)result.size() == 1+(last-first)) // pointer math agrees with [] operator - textSize = Base64::binaryToText(byteBuffer, byteCount, &result[0]); - else + if (config_.format == Format_MzMLb) { - std::vector contig; // work in this contiguous memory then copy to string - contig.resize(result.size()); - textSize = Base64::binaryToText(byteBuffer, byteCount, &contig[0]); - copy(contig.begin(), contig.end(), result.begin()); + // no base64 encoding as storing as binary in HDF5 + + result.resize(byteCount); + + // std::string storage is not guaranteed contiguous in older C++ standards, + // and on long strings this has caused problems in the wild. So test for + // actual contiguousness, and fall back to std::vector if needed + // thx Johan Teleman + char *first = &result[0]; + char *last = &result[result.size() - 1]; + if ((int)result.size() == 1 + (last - first)) // pointer math agrees with [] operator + memcpy(&result[0], byteBuffer, byteCount); + else + { + std::vector contig; // work in this contiguous memory then copy to string + contig.resize(result.size()); + memcpy(&contig[0], byteBuffer, byteCount); + copy(contig.begin(), contig.end(), result.begin()); + } + } + else + { + // Base64 encoding + + result.resize(Base64::binaryToTextSize(byteCount)); + + // std::string storage is not guaranteed contiguous in older C++ standards, + // and on long strings this has caused problems in the wild. So test for + // actual contiguousness, and fall back to std::vector if needed + // thx Johan Teleman + size_t textSize; + char *first = &result[0]; + char *last = &result[result.size() - 1]; + if ((int)result.size() == 1 + (last - first)) // pointer math agrees with [] operator + textSize = Base64::binaryToText(byteBuffer, byteCount, &result[0]); + else + { + std::vector contig; // work in this contiguous memory then copy to string + contig.resize(result.size()); + textSize = Base64::binaryToText(byteBuffer, byteCount, &contig[0]); + copy(contig.begin(), contig.end(), result.begin()); + } + result.resize(textSize); } - result.resize(textSize); if (binaryByteCount != NULL) *binaryByteCount = byteCount; // size before base64 encoding @@ -467,18 +495,30 @@ void BinaryDataEncoder::Impl::decode(const char *encodedData, size_t length, pwi if (!encodedData || !length) return; - // Base64 decoding - - vector binary(Base64::textToBinarySize(length)); - size_t binarySize = Base64::textToBinary(encodedData, length, &binary[0]); - binary.resize(binarySize); - // buffer abstractions - void* byteBuffer = &binary[0]; - size_t byteCount = binarySize; + vector binary; + void* byteBuffer; + size_t byteCount; size_t initialSize; + if (config_.format == Format_MzMLb) + { + byteBuffer = (void*) encodedData; + byteCount = length; + } + else + { + // Base64 decoding + + binary.resize(Base64::textToBinarySize(length)); + size_t binarySize = Base64::textToBinary(encodedData, length, &binary[0]); + binary.resize(binarySize); + + byteBuffer = &binary[0]; + byteCount = binarySize; + } + // decompression vector decompressed; diff --git a/pwiz/data/msdata/BinaryDataEncoder.hpp b/pwiz/data/msdata/BinaryDataEncoder.hpp index 696afa3594..7a0dc80218 100644 --- a/pwiz/data/msdata/BinaryDataEncoder.hpp +++ b/pwiz/data/msdata/BinaryDataEncoder.hpp @@ -51,6 +51,10 @@ class PWIZ_API_DECL BinaryDataEncoder enum Compression {Compression_None, Compression_Zlib}; enum Numpress {Numpress_None, Numpress_Linear, Numpress_Pic, Numpress_Slof}; // lossy numerical representations + enum Prediction {Prediction_None, Prediction_Delta, Prediction_Linear}; + enum Format {Format_MzML, Format_MzMLb}; // if Format_MzMLb, we do not base64 encode, endianise or compress + enum Type {Type_None, Type_Spectrum, Type_Chromatogram}; // mzMLb needs to know whether this BinaryDataArray is a spectrum or chromatogram (for HDF5 dataset name) [*this not essential*] + /// encoding/decoding configuration struct PWIZ_API_DECL Config { @@ -63,8 +67,15 @@ class PWIZ_API_DECL BinaryDataEncoder double numpressSlofErrorTolerance; // guarantee abs(1.0-(encoded/decoded)) <= this, 0=do not guarantee anything double numpressLinearAbsMassAcc; // absolute mass error for lossy linear compression in Th (e.g. use 1e-4 for 1ppm @ 100 Th) + Prediction prediction; + Format format; + Type type; + int truncation; // how many bits mantissa to truncate (and hence not store) + std::map precisionOverrides; - std::map numpressOverrides; + std::map numpressOverrides; + std::map predictionOverrides; + std::map truncationOverrides; Config() : precision(Precision_64), @@ -74,7 +85,11 @@ class PWIZ_API_DECL BinaryDataEncoder numpressFixedPoint(0.0), numpressLinearErrorTolerance(BinaryDataEncoder_default_numpressLinearErrorTolerance), numpressSlofErrorTolerance(BinaryDataEncoder_default_numpressSlofErrorTolerance), - numpressLinearAbsMassAcc(-1.0) + numpressLinearAbsMassAcc(-1.0), + prediction(Prediction_None), + format(Format_MzML), + type(Type_None), + truncation(0) {} }; diff --git a/pwiz/data/msdata/DefaultReaderList.cpp b/pwiz/data/msdata/DefaultReaderList.cpp index 1913f80763..fc868aeca9 100755 --- a/pwiz/data/msdata/DefaultReaderList.cpp +++ b/pwiz/data/msdata/DefaultReaderList.cpp @@ -41,6 +41,10 @@ #include "References.hpp" #include "ChromatogramListBase.hpp" #include "pwiz/data/msdata/Version.hpp" +#ifndef WITHOUT_MZMLB +#include "mzmlb/Connection_mzMLb.hpp" +using namespace pwiz::msdata::mzmlb; +#endif #ifndef WITHOUT_MZ5 #include "mz5/Connection_mz5.hpp" #endif @@ -488,6 +492,87 @@ PWIZ_API_DECL void Reader_mz5::read(const std::string& filename, read(filename, head, *results.back()); } +// +// Reader_mzMLb +// + +// This version only checks whether the file is a HDF5 file. +namespace { + + const char mzMLbHeader[] = { '\x89', '\x48', '\x44', '\x46', '\x0d', '\x0a', '\x1a', '\x0a' }; + const size_t mzMLbHeaderSize = sizeof(mzMLbHeader) / sizeof(char); + +} // namespace + +PWIZ_API_DECL std::string Reader_mzMLb::identify(const string& filename, const string& head) const +{ + if (head.length() < mzMLbHeaderSize) + return ""; + + for (size_t i = 0; i < mzMLbHeaderSize; ++i) + if (head[i] != mzMLbHeader[i]) + return ""; + + try + { +#ifndef WITHOUT_MZMLB + Connection_mzMLb con(filename, true); +#endif + return getType(); + } + catch (std::runtime_error&) + { + return ""; + } + + return ""; +} + + +PWIZ_API_DECL void Reader_mzMLb::read(const std::string& filename, + const std::string& head, + MSData& result, + int runIndex, + const Config& config) const +{ +#ifdef WITHOUT_MZMLB + throw ReaderFail("[Reader_mzMLb::read] library was not built with mzMLb support."); +#else + if (runIndex != 0) + throw ReaderFail("[Reader_mzML::read] multiple runs not supported"); + + Connection_mzMLb con(filename); + shared_ptr is(new boost::iostreams::stream(con)); + if (!is.get() || !*is) + throw runtime_error(("[Reader_mzML::read] Unable to open file " + filename).c_str()); + + string rootElement = xml_root_element(*is); + if (rootElement == "mzML") + { + Serializer_mzML::Config config; + config.indexed = false; + Serializer_mzML serializer(config); + serializer.read(is, result); + } + else if (rootElement == "indexedmzML") + { + Serializer_mzML serializer; + serializer.read(is, result); + } + + fillInCommonMetadata(filename, result); +#endif +} + +PWIZ_API_DECL void Reader_mzMLb::read(const std::string& filename, + const std::string& head, + std::vector& results, + const Config& config) const +{ + results.push_back(MSDataPtr(new MSData)); + read(filename, head, *results.back(), 0, config); +} + /// default Reader list PWIZ_API_DECL DefaultReaderList::DefaultReaderList() @@ -498,6 +583,7 @@ PWIZ_API_DECL DefaultReaderList::DefaultReaderList() emplace_back(new Reader_MS1); emplace_back(new Reader_MS2); emplace_back(new Reader_BTDX); + emplace_back(new Reader_mzMLb); emplace_back(new Reader_mz5); } diff --git a/pwiz/data/msdata/DefaultReaderList.hpp b/pwiz/data/msdata/DefaultReaderList.hpp index 194fcc3f90..55181e87e5 100755 --- a/pwiz/data/msdata/DefaultReaderList.hpp +++ b/pwiz/data/msdata/DefaultReaderList.hpp @@ -126,6 +126,18 @@ class PWIZ_API_DECL Reader_mz5 : public Reader }; +class PWIZ_API_DECL Reader_mzMLb : public Reader +{ + public: + virtual std::string identify(const std::string& filename, const std::string& head) const; + virtual void read(const std::string& filename, const std::string& head, MSData& result, int runIndex = 0, const Config& config = Config()) const; + virtual void read(const std::string& filename, const std::string& head, std::vector& results, const Config& config = Config()) const; + virtual const char* getType() const {return "mzMLb";} + virtual CVID getCvType() const {return MS_mzMLb_format;} + virtual std::vector getFileExtensions() const {return {".mzMLb"};} +}; + + /// default Reader list class PWIZ_API_DECL DefaultReaderList : public ReaderList { diff --git a/pwiz/data/msdata/IO.cpp b/pwiz/data/msdata/IO.cpp index c2b6b82500..aa8c34578f 100644 --- a/pwiz/data/msdata/IO.cpp +++ b/pwiz/data/msdata/IO.cpp @@ -29,6 +29,10 @@ #include "pwiz/utility/misc/Filesystem.hpp" #include "pwiz/utility/misc/Std.hpp" #include "SpectrumWorkerThreads.hpp" +#ifndef WITHOUT_MZMLB +#include "mzmlb/Connection_mzMLb.hpp" +using namespace pwiz::msdata::mzmlb; +#endif namespace pwiz { namespace msdata { @@ -1614,59 +1618,303 @@ PWIZ_API_DECL void read(std::istream& is, ScanList& scanList) // BinaryData // +template +void writeMzMLbExtra(stream* mzMLb_os, string& dataset, size_t& offset, const string& encoded, XMLWriter& writer, const BinaryDataArrayType& binaryDataArray, const BinaryDataEncoder::Config& usedConfig) +{ +} + +template <> +void writeMzMLbExtra(stream* mzMLb_os, string& dataset, size_t& offset, const string& encoded, XMLWriter& writer, const BinaryDataArray& binaryDataArray, const BinaryDataEncoder::Config& usedConfig) +{ +#ifndef WITHOUT_MZMLB + + size_t encoded_size = encoded.size(); + + // mzMLb, including truncation and prediction, from andrew.dowsey@bristol.ac.uk + if (mzMLb_os) + { + dataset = (usedConfig.type == BinaryDataEncoder::Type_Spectrum ? "spectrum_" : (usedConfig.type == BinaryDataEncoder::Type_Chromatogram ? "chromatogram_" : "")); + dataset += cvTermInfo(binaryDataArray.cvParamChild(MS_binary_data_array).cvid).id; + replace(dataset.begin(), dataset.end(), ':', '_'); + + if (usedConfig.numpress != BinaryDataEncoder::Numpress_None) + { + if (usedConfig.numpress == BinaryDataEncoder::Numpress_Linear) dataset += "_numpress_linear"; + else if (usedConfig.numpress == BinaryDataEncoder::Numpress_Pic) dataset += "_numpress_pic"; + else dataset += "_numpress_slof"; + + offset = (*mzMLb_os)->seek(dataset, 0, std::ios_base::cur); + (*mzMLb_os)->write_opaque(dataset, (const unsigned char*)&encoded[0], encoded_size); + } + else if (usedConfig.precision == BinaryDataEncoder::Precision_32) + { + vector float_data(binaryDataArray.data.size()); + + if (usedConfig.truncation == 0 && usedConfig.prediction == BinaryDataEncoder::Prediction_None) + { + for (size_t i = 0; i < binaryDataArray.data.size(); i++) float_data[i] = (float)binaryDataArray.data[i]; + } + else + { + // truncation + if (usedConfig.truncation > 0) + { + union { unsigned long ix; float fx; } v; + unsigned long bitmask = ~(((unsigned long)1 << usedConfig.truncation) - 1); + for (size_t i = 0; i < binaryDataArray.data.size(); i++) + { + v.fx = (float)binaryDataArray.data[i]; + v.ix &= bitmask; + float_data[i] = v.fx; + } + } + else if (usedConfig.truncation == -1) + { + for (size_t i = 0; i < binaryDataArray.data.size(); i++) + { + float_data[i] = round((float)binaryDataArray.data[i]); + } + } + else + { + for (size_t i = 0; i < binaryDataArray.data.size(); i++) + { + float_data[i] = (float)binaryDataArray.data[i]; + } + } + + // prediction + switch (usedConfig.prediction) + { + case BinaryDataEncoder::Prediction_Delta: + if (float_data.size() > 0) + { + float previous = float_data[0]; + for (int i = 1; i < float_data.size(); i++) + { + // encoding + float_data[i] = float_data[0] + float_data[i] - previous; + + // decoding + previous = float_data[i] + previous - float_data[0]; + } + } + break; + case BinaryDataEncoder::Prediction_Linear: + if (float_data.size() > 0) + { + float previous2 = float_data[0]; + + if (float_data.size() > 1) + { + float previous1 = float_data[1]; + + for (int i = 2; i < float_data.size(); i++) + { + // encoding + float_data[i] = float_data[1] + float_data[i] - 2.0 * previous1 + previous2; + + // decoding + float t = previous1; + previous1 = float_data[i] + 2.0f * previous1 - previous2 - float_data[1]; + previous2 = t; + } + } + } + break; + case BinaryDataEncoder::Prediction_None: + break; + } + + } + + dataset += "_float"; + offset = (*mzMLb_os)->seek(dataset, 0, std::ios_base::cur); + encoded_size = float_data.size() * sizeof(float); + (*mzMLb_os)->write(dataset, &float_data[0], float_data.size()); + } + else + { + vector double_data; + + // truncation + if (usedConfig.truncation > 0) + { + double_data.resize(binaryDataArray.data.size()); + + union { unsigned long long ix; double fx; } v; + unsigned long long bitmask = ~(((unsigned long long)1 << usedConfig.truncation) - 1); + for (size_t i = 0; i < binaryDataArray.data.size(); i++) + { + v.fx = binaryDataArray.data[i]; + v.ix &= bitmask; + double_data[i] = v.fx; + } + } + else if (usedConfig.truncation == -1) + { + double_data.resize(binaryDataArray.data.size()); + for (size_t i = 0; i < binaryDataArray.data.size(); i++) + { + double_data[i] = round(binaryDataArray.data[i]); + } + } + else + { + double_data = binaryDataArray.data; + } + + // prediction + switch (usedConfig.prediction) + { + case BinaryDataEncoder::Prediction_Delta: + if (double_data.size() > 0) + { + double previous = double_data[0]; + for (int i = 1; i < double_data.size(); i++) + { + // encoding + double_data[i] = double_data[0] + double_data[i] - previous; + + // decoding + previous = double_data[i] + previous - double_data[0]; + } + } + break; + case BinaryDataEncoder::Prediction_Linear: + if (double_data.size() > 0) + { + double previous2 = double_data[0]; + + if (double_data.size() > 1) + { + double previous1 = double_data[1]; + + for (int i = 2; i < double_data.size(); i++) + { + // encoding + double_data[i] = double_data[1] + double_data[i] - 2.0 * previous1 + previous2; + + // decoding + double t = previous1; + previous1 = double_data[i] + 2.0 * previous1 - previous2 - double_data[1]; + previous2 = t; + } + } + } + break; + case BinaryDataEncoder::Prediction_None: + break; + } + + dataset += "_double"; + offset = (*mzMLb_os)->seek(dataset, 0, std::ios_base::cur); + encoded_size = double_data.size() * sizeof(double); + (*mzMLb_os)->write(dataset, &double_data[0], double_data.size()); + } + } +#endif +} + + template void writeBinaryDataArray(minimxml::XMLWriter& writer, const BinaryDataArrayType& binaryDataArray, const BinaryDataEncoder::Config& config) { BinaryDataEncoder::Config usedConfig = config; - map::const_iterator overrideItr = config.precisionOverrides.find(binaryDataArray.cvParamChild(MS_binary_data_array).cvid); + auto overrideItr = config.precisionOverrides.find(binaryDataArray.cvParamChild(MS_binary_data_array).cvid); if (overrideItr != config.precisionOverrides.end()) usedConfig.precision = overrideItr->second; - map::const_iterator n_overrideItr = config.numpressOverrides.find(binaryDataArray.cvParamChild(MS_binary_data_array).cvid); + auto n_overrideItr = config.numpressOverrides.find(binaryDataArray.cvParamChild(MS_binary_data_array).cvid); if (n_overrideItr != config.numpressOverrides.end()) usedConfig.numpress = n_overrideItr->second; + auto t_overrideItr = config.truncationOverrides.find(binaryDataArray.cvParamChild(MS_binary_data_array).cvid); + if (t_overrideItr != config.truncationOverrides.end()) + usedConfig.truncation = t_overrideItr->second; + + string encoded; - BinaryDataEncoder encoder(usedConfig); - string encoded; - encoder.encode(binaryDataArray.data, encoded); - usedConfig = encoder.getConfig(); // config may have changed if numpress error was excessive +#ifndef WITHOUT_MZMLB + stream* mzMLb_os = dynamic_cast*>(&writer.getOutputStream()); + if (mzMLb_os) + { + usedConfig.format = BinaryDataEncoder::Format_MzMLb; + + auto p_overrideItr = config.predictionOverrides.find(binaryDataArray.cvParamChild(MS_binary_data_array).cvid); + if (p_overrideItr != config.predictionOverrides.end()) + usedConfig.prediction = p_overrideItr->second; + } + + // If writing into mzML or using any form of numpress, we still need to encode the data to a byte stream + if (!mzMLb_os || usedConfig.numpress != BinaryDataEncoder::Numpress_None) +#endif + { + BinaryDataEncoder encoder(usedConfig); + encoder.encode(binaryDataArray.data, encoded); + usedConfig = encoder.getConfig(); // config may have changed if numpress error was excessive + } + XMLWriter::Attributes attributes; + + size_t encoded_size = encoded.size(); + + string dataset; + size_t offset; + writeMzMLbExtra(mzMLb_os, dataset, offset, encoded, writer, binaryDataArray, usedConfig); // primary array types can never override the default array length if (!binaryDataArray.hasCVParam(MS_m_z_array) && !binaryDataArray.hasCVParam(MS_time_array) && !binaryDataArray.hasCVParam(MS_intensity_array)) { - attributes.add("arrayLength", binaryDataArray.data.size()); +#ifndef WITHOUT_MZMLB + if (mzMLb_os) + { + attributes.add("arrayLength", 0); + } + else +#endif + { + attributes.add("arrayLength", binaryDataArray.data.size()); + } } - attributes.add("encodedLength", encoded.size()); + attributes.add("encodedLength", encoded_size); + if (binaryDataArray.dataProcessingPtr.get()) attributes.add("dataProcessingRef", encode_xml_id_copy(binaryDataArray.dataProcessingPtr->id)); writer.startElement("binaryDataArray", attributes); + bool isIntegerArray = typeid(BinaryDataArrayType) == typeid(IntegerDataArray); + if (BinaryDataEncoder::Numpress_None == usedConfig.numpress) { if (usedConfig.precision == BinaryDataEncoder::Precision_32) - write(writer, typeid(BinaryDataArrayType) == typeid(IntegerDataArray) ? MS_32_bit_integer : MS_32_bit_float); + write(writer, isIntegerArray ? MS_32_bit_integer : MS_32_bit_float); else - write(writer, typeid(BinaryDataArrayType) == typeid(IntegerDataArray) ? MS_64_bit_integer : MS_64_bit_float); + write(writer, isIntegerArray ? MS_64_bit_integer : MS_64_bit_float); } if (usedConfig.byteOrder == BinaryDataEncoder::ByteOrder_BigEndian) throw runtime_error("[IO::write()] mzML: must use little endian encoding."); - bool zlib = false; // Handle numpress+zlib + bool zlib = false; // Handle numpress+zlib + + if (usedConfig.prediction == BinaryDataEncoder::Prediction_Linear) + write(writer, MS_truncation__linear_prediction_and_zlib_compression); + else if (usedConfig.prediction == BinaryDataEncoder::Prediction_Delta) + write(writer, MS_truncation__delta_prediction_and_zlib_compression); + switch (usedConfig.compression) { case BinaryDataEncoder::Compression_None: if (BinaryDataEncoder::Numpress_None == usedConfig.numpress) write(writer, MS_no_compression); break; case BinaryDataEncoder::Compression_Zlib: - zlib = true; - if (BinaryDataEncoder::Numpress_None == usedConfig.numpress) - write(writer, MS_zlib_compression); + zlib = true; + if (BinaryDataEncoder::Numpress_None == usedConfig.numpress) + write(writer, MS_zlib_compression); break; default: throw runtime_error("[IO::write()] Unsupported compression method."); @@ -1674,16 +1922,16 @@ void writeBinaryDataArray(minimxml::XMLWriter& writer, const BinaryDataArrayType } switch (usedConfig.numpress) { case BinaryDataEncoder::Numpress_Linear: - write(writer, MS_32_bit_float); // This should actually be ignored by any reader since numpress defines word size and format, but it makes output standards-compliant and is pretty close to true anyway - write(writer, zlib ? MS_MS_Numpress_linear_prediction_compression_followed_by_zlib_compression : MS_MS_Numpress_linear_prediction_compression); + write(writer, isIntegerArray ? MS_32_bit_integer : MS_32_bit_float); // This should actually be ignored by any reader since numpress defines word size and format, but it makes output standards-compliant and is pretty close to true anyway + write(writer, zlib ? MS_MS_Numpress_linear_prediction_compression_followed_by_zlib_compression : MS_MS_Numpress_linear_prediction_compression); break; case BinaryDataEncoder::Numpress_Pic: - write(writer, MS_32_bit_integer); // This should actually be ignored by any reader since numpress defines word size and format, but it makes output standards-compliant and is pretty close to true anyway - write(writer, zlib ? MS_MS_Numpress_positive_integer_compression_followed_by_zlib_compression : MS_MS_Numpress_positive_integer_compression); + write(writer, isIntegerArray ? MS_32_bit_integer : MS_32_bit_float); // This should actually be ignored by any reader since numpress defines word size and format, but it makes output standards-compliant and is pretty close to true anyway + write(writer, zlib ? MS_MS_Numpress_positive_integer_compression_followed_by_zlib_compression : MS_MS_Numpress_positive_integer_compression); break; case BinaryDataEncoder::Numpress_Slof: - write(writer, MS_32_bit_float); // This should actually be ignored by any reader since numpress defines word size and format, but it makes output standards-compliant and is pretty close to true anyway - write(writer, zlib ? MS_MS_Numpress_short_logged_float_compression_followed_by_zlib_compression : MS_MS_Numpress_short_logged_float_compression); + write(writer, isIntegerArray ? MS_32_bit_integer : MS_32_bit_float); // This should actually be ignored by any reader since numpress defines word size and format, but it makes output standards-compliant and is pretty close to true anyway + write(writer, zlib ? MS_MS_Numpress_short_logged_float_compression_followed_by_zlib_compression : MS_MS_Numpress_short_logged_float_compression); break; case BinaryDataEncoder::Numpress_None: break; @@ -1692,13 +1940,31 @@ void writeBinaryDataArray(minimxml::XMLWriter& writer, const BinaryDataArrayType break; } +#ifndef WITHOUT_MZMLB + if (mzMLb_os) + { + write(writer, CVParam(MS_external_HDF5_dataset, dataset)); + write(writer, CVParam(MS_external_array_length, toString(binaryDataArray.data.size()))); + write(writer, CVParam(MS_external_offset, toString(offset))); + } +#endif + writeParamContainer(writer, binaryDataArray); - writer.pushStyle(XMLWriter::StyleFlag_InlineInner); - writer.startElement("binary"); - writer.characters(encoded, false); // base64 doesn't use any reserved characters - writer.endElement(); - writer.popStyle(); +#ifndef WITHOUT_MZMLB + if (mzMLb_os) + { + writer.startElement("binary", XMLWriter::Attributes(), XMLWriter::EmptyElement); + } + else +#endif + { + writer.pushStyle(XMLWriter::StyleFlag_InlineInner); + writer.startElement("binary"); + writer.characters(encoded, false); // base64 doesn't use any reserved characters + writer.endElement(); + writer.popStyle(); + } writer.endElement(); } @@ -1722,7 +1988,7 @@ struct HandlerBinaryDataArray : public HandlerParamContainer std::vector* binaryDataArrayPtrs; std::vector* integerDataArrayPtrs; const MSData* msd; - size_t defaultArrayLength; + size_t* defaultArrayLength; // make pointer so mzMLb can update defaultArrayLength BinaryDataEncoder::Config config; ParamContainer paramContainer; DataProcessingPtr dataProcessingPtr; @@ -1731,14 +1997,15 @@ struct HandlerBinaryDataArray : public HandlerParamContainer BinaryDataArray* binaryDataArray; IntegerDataArray* integerDataArray; - HandlerBinaryDataArray(std::vector* binaryDataArrayPtrs = 0, std::vector* integerDataArrayPtrs = 0, const MSData* _msd = 0, BinaryDataFlag binaryDataFlag = ReadBinaryData) + HandlerBinaryDataArray(std::vector* binaryDataArrayPtrs = 0, std::vector* integerDataArrayPtrs = 0, const MSData* _msd = 0, std::istream* _is = 0, BinaryDataFlag binaryDataFlag = ReadBinaryData) : binaryDataArrayPtrs(binaryDataArrayPtrs), integerDataArrayPtrs(integerDataArrayPtrs), msd(_msd), defaultArrayLength(0), binaryDataFlag(binaryDataFlag), arrayLength_(0), - encodedLength_(0) + encodedLength_(0), + is_(_is) { parseCharacters = true; autoUnescapeCharacters = false; @@ -1763,12 +2030,15 @@ struct HandlerBinaryDataArray : public HandlerParamContainer dataProcessingPtr.reset(); getAttribute(attributes, "encodedLength", encodedLength_, NoXMLUnescape); - getAttribute(attributes, "arrayLength", arrayLength_, NoXMLUnescape, defaultArrayLength); + if (defaultArrayLength) + getAttribute(attributes, "arrayLength", arrayLength_, NoXMLUnescape, *defaultArrayLength); + else + getAttribute(attributes, "arrayLength", arrayLength_, NoXMLUnescape); return Status::Ok; } else if (name == "binary") - { + { if (msd) References::resolve(paramContainer, *msd); config = getConfig(); @@ -1801,6 +2071,7 @@ struct HandlerBinaryDataArray : public HandlerParamContainer swap(static_cast(*binaryDataArray), paramContainer); binaryDataArray->dataProcessingPtr = dataProcessingPtr; this->binaryDataArray = &*binaryDataArrayPtrs->back(); + readMzMLbBinaryDataArray(this->binaryDataArray); } break; @@ -1826,6 +2097,7 @@ struct HandlerBinaryDataArray : public HandlerParamContainer swap(static_cast(*binaryDataArray), paramContainer); binaryDataArray->dataProcessingPtr = dataProcessingPtr; this->integerDataArray = &*integerDataArrayPtrs->back(); + readMzMLbBinaryDataArray(this->integerDataArray); } break; @@ -1834,6 +2106,7 @@ struct HandlerBinaryDataArray : public HandlerParamContainer throw runtime_error("[IO::HandlerBinaryDataArray] Unknown binary data type."); } + return Status::Ok; } } // end if not cvParam @@ -1857,6 +2130,7 @@ struct HandlerBinaryDataArray : public HandlerParamContainer case MS_64_bit_float: { encoder.decode(text.c_str(), text.length(), binaryDataArray->data); + predict(); if (binaryDataArray->data.size() != arrayLength_) throw runtime_error((format("[IO::HandlerBinaryDataArray] At position %d: expected array of size %d, but decoded array is actually size %d.") @@ -1890,6 +2164,10 @@ struct HandlerBinaryDataArray : public HandlerParamContainer size_t arrayLength_; size_t encodedLength_; + std::string external_dataset_; + size_t external_offset_; + + std::istream* is_; CVID extractCVParam(ParamContainer& container, CVID cvid) { @@ -1915,6 +2193,30 @@ struct HandlerBinaryDataArray : public HandlerParamContainer return result; } + template + bool extractCVParamValue(ParamContainer& container, CVID cvid, ValueT& value) + { + vector& params = container.cvParams; + vector::iterator it = find_if(params.begin(), params.end(), + CVParamIsChildOf(cvid)); + + if (it != params.end()) + { + // found the cvid in container -- erase the CVParam + value = lexical_cast(it->value); + params.erase(it); + } + else + { + // didn't find it -- search recursively, but don't erase anything + CVParam temp = container.cvParamChild(cvid); + if (temp.empty()) + return false; + value = lexical_cast(temp.value); + } + return true; + } + void extractCVParams(ParamContainer& container, CVID cvid, vector &results) { vector& params = container.cvParams; @@ -1932,6 +2234,35 @@ struct HandlerBinaryDataArray : public HandlerParamContainer results.push_back(cvParam.cvid); } + bool extractUserParam(ParamContainer& container, string name) + { + vector& params = container.userParams; + for (vector::iterator it = params.begin(); it != params.end(); it++) + { + if (it->name == name) + { + params.erase(it); + return true; + } + } + return false; + } + + bool extractUserParam(ParamContainer& container, string name, string& value) + { + vector& params = container.userParams; + for (vector::iterator it = params.begin(); it != params.end(); it++) + { + if (it->name == name) + { + value = it->value; + params.erase(it); + return true; + } + } + return false; + } + BinaryDataEncoder::Config getConfig() { BinaryDataEncoder::Config config; @@ -1942,6 +2273,14 @@ struct HandlerBinaryDataArray : public HandlerParamContainer // and remove them from the BinaryDataArray struct (extractCVParam does the removal). // + // prediction + bool prediction_linear = extractCVParam(paramContainer, MS_truncation__linear_prediction_and_zlib_compression) != CVID_Unknown; + bool prediction_delta = extractCVParam(paramContainer, MS_truncation__delta_prediction_and_zlib_compression) != CVID_Unknown; + if (prediction_linear) + config.prediction = BinaryDataEncoder::Prediction_Linear; + else if (prediction_delta) + config.prediction = BinaryDataEncoder::Prediction_Delta; + cvidBinaryDataType = extractCVParam(paramContainer, MS_binary_data_type); // handle mix of zlib and numpress compression @@ -1960,34 +2299,39 @@ struct HandlerBinaryDataArray : public HandlerParamContainer case MS_zlib_compression: config.compression = BinaryDataEncoder::Compression_Zlib; break; - case MS_MS_Numpress_linear_prediction_compression: - config.numpress = BinaryDataEncoder::Numpress_Linear; - break; - case MS_MS_Numpress_positive_integer_compression: - config.numpress = BinaryDataEncoder::Numpress_Pic; - break; - case MS_MS_Numpress_short_logged_float_compression: - config.numpress = BinaryDataEncoder::Numpress_Slof; - break; - case MS_MS_Numpress_linear_prediction_compression_followed_by_zlib_compression: - config.numpress = BinaryDataEncoder::Numpress_Linear; - config.compression = BinaryDataEncoder::Compression_Zlib; - break; - case MS_MS_Numpress_positive_integer_compression_followed_by_zlib_compression: - config.numpress = BinaryDataEncoder::Numpress_Pic; - config.compression = BinaryDataEncoder::Compression_Zlib; - break; - case MS_MS_Numpress_short_logged_float_compression_followed_by_zlib_compression: - config.numpress = BinaryDataEncoder::Numpress_Slof; - config.compression = BinaryDataEncoder::Compression_Zlib; - break; - default: + case MS_MS_Numpress_linear_prediction_compression: + config.numpress = BinaryDataEncoder::Numpress_Linear; + break; + case MS_MS_Numpress_positive_integer_compression: + config.numpress = BinaryDataEncoder::Numpress_Pic; + break; + case MS_MS_Numpress_short_logged_float_compression: + config.numpress = BinaryDataEncoder::Numpress_Slof; + break; + case MS_MS_Numpress_linear_prediction_compression_followed_by_zlib_compression: + config.numpress = BinaryDataEncoder::Numpress_Linear; + config.compression = BinaryDataEncoder::Compression_Zlib; + break; + case MS_MS_Numpress_positive_integer_compression_followed_by_zlib_compression: + config.numpress = BinaryDataEncoder::Numpress_Pic; + config.compression = BinaryDataEncoder::Compression_Zlib; + break; + case MS_MS_Numpress_short_logged_float_compression_followed_by_zlib_compression: + config.numpress = BinaryDataEncoder::Numpress_Slof; + config.compression = BinaryDataEncoder::Compression_Zlib; + break; + default: throw runtime_error("[IO::HandlerBinaryDataArray] Unknown compression type."); } } - // if numpress is on, make sure output is directed to BinaryDataArray instead of IntegerDataArray - if (BinaryDataEncoder::Numpress_None != config.numpress) + // with mzMLb, binary data type is used to indicate whether the array should be restored into a BinaryDataArray or an IntegerDataArray + // with mzML, when numpress is on, binary data type is used to indicate whether PIC compression was used; + // when numpress is off, it is used to indicate the original array type + + // if numpress is on, make sure Numpress PIC arrays are directed to BinaryDataArray instead of IntegerDataArray + auto mzMLb_is = dynamic_cast*>(is_); + if (BinaryDataEncoder::Numpress_None != config.numpress && !mzMLb_is) switch (cvidBinaryDataType) { case MS_32_bit_integer: @@ -2023,6 +2367,91 @@ struct HandlerBinaryDataArray : public HandlerParamContainer return config; } + + + template + void readMzMLbBinaryDataArray(BinaryDataArrayType* binaryDataArray) + { +#ifndef WITHOUT_MZMLB + auto mzMLb_is = dynamic_cast*>(is_); + if (!mzMLb_is) + return; + + extractCVParamValue(*binaryDataArray, MS_external_HDF5_dataset, external_dataset_); + + string external_array_length; + if (extractCVParamValue(*binaryDataArray, MS_external_array_length, external_array_length)) + { + arrayLength_ = lexical_cast(external_array_length); + + // primary array types so set the default array length + if (binaryDataArray->hasCVParam(MS_m_z_array) || + binaryDataArray->hasCVParam(MS_time_array) || + binaryDataArray->hasCVParam(MS_intensity_array) || + binaryDataArray->hasCVParam(MS_wavelength_array)) + { + if (defaultArrayLength) + *defaultArrayLength = arrayLength_; + } + } + + string external_offset; + if (extractCVParamValue(*binaryDataArray, MS_external_offset, external_offset)) + external_offset_ = lexical_cast(external_offset); + + if (!external_dataset_.empty()) + { + (*mzMLb_is)->seek(external_dataset_, external_offset_, std::ios_base::beg); + + if (config.numpress != BinaryDataEncoder::Numpress_None) + { + vector buf(encodedLength_); + (*mzMLb_is)->read_opaque(external_dataset_, &buf[0], encodedLength_); + config.format = BinaryDataEncoder::Format_MzMLb; + BinaryDataEncoder encoder(config); + encoder.decode(&buf[0], buf.size(), binaryDataArray->data); + } + else + { + binaryDataArray->data.resize(arrayLength_); + if (arrayLength_ > 0) + (*mzMLb_is)->read(external_dataset_, &binaryDataArray->data[0], arrayLength_); + } + predict(); + } +#endif + } + + void predict() + { + switch (config.prediction) + { + case BinaryDataEncoder::Prediction_Delta: + switch (config.precision) + { + case BinaryDataEncoder::Precision_32: + for (size_t i = 2; i < binaryDataArray->data.size(); i++) binaryDataArray->data[i] = (float)binaryDataArray->data[i] + (float)binaryDataArray->data[i - 1] - (float)binaryDataArray->data[0]; + break; + case BinaryDataEncoder::Precision_64: + for (size_t i = 2; i < binaryDataArray->data.size(); i++) binaryDataArray->data[i] = binaryDataArray->data[i] + binaryDataArray->data[i - 1] - binaryDataArray->data[0]; + break; + } + break; + case BinaryDataEncoder::Prediction_Linear: + switch (config.precision) + { + case BinaryDataEncoder::Precision_32: + for (size_t i = 2; i < binaryDataArray->data.size(); i++) binaryDataArray->data[i] = (float)binaryDataArray->data[i] + 2.0f * (float)binaryDataArray->data[i - 1] - (float)binaryDataArray->data[i - 2] - (float)binaryDataArray->data[1]; + break; + case BinaryDataEncoder::Precision_64: + for (size_t i = 2; i < binaryDataArray->data.size(); i++) binaryDataArray->data[i] = binaryDataArray->data[i] + 2.0 * binaryDataArray->data[i - 1] - binaryDataArray->data[i - 2] - binaryDataArray->data[1]; + break; + } + break; + case BinaryDataEncoder::Prediction_None: + break; + } + } }; @@ -2046,8 +2475,10 @@ void write(minimxml::XMLWriter& writer, const Spectrum& spectrum, const MSData& attributes.add("index", spectrum.index); attributes.add("id", spectrum.id); // not an XML:ID if (!spectrum.spotID.empty()) - attributes.add("spotID", spectrum.spotID); + attributes.add("spotID", spectrum.spotID); + attributes.add("defaultArrayLength", spectrum.defaultArrayLength); + if (spectrum.dataProcessingPtr.get()) attributes.add("dataProcessingRef", encode_xml_id_copy(spectrum.dataProcessingPtr->id)); if (spectrum.sourceFilePtr.get()) @@ -2091,11 +2522,14 @@ void write(minimxml::XMLWriter& writer, const Spectrum& spectrum, const MSData& attributes.add("count", spectrum.binaryDataArrayPtrs.size() + spectrum.integerDataArrayPtrs.size()); writer.startElement("binaryDataArrayList", attributes); + BinaryDataEncoder::Config updated_config = config; + updated_config.type = BinaryDataEncoder::Type_Spectrum; + for (const auto& itr : spectrum.binaryDataArrayPtrs) - writeBinaryDataArray(writer, *itr, config); + writeBinaryDataArray(writer, *itr, updated_config); for (const auto& itr : spectrum.integerDataArrayPtrs) - writeBinaryDataArray(writer, *itr, config); + writeBinaryDataArray(writer, *itr, updated_config); writer.endElement(); // binaryDataArrayList } @@ -2116,13 +2550,15 @@ struct HandlerSpectrum : public HandlerParamContainer Spectrum* _spectrum = 0, const map* legacyIdRefToNativeId = 0, const MSData* _msd = 0, - const SpectrumIdentityFromXML *_spectrumID = 0) + const SpectrumIdentityFromXML *_spectrumID = 0, + std::istream* is = 0) : binaryDataFlag(_binaryDataFlag), spectrum(_spectrum), spectrumID(_spectrumID), legacyIdRefToNativeId(legacyIdRefToNativeId), msd(_msd), - handlerPrecursor_(0, legacyIdRefToNativeId) + handlerPrecursor_(0, legacyIdRefToNativeId), + handlerBinaryDataArray_(0, 0, 0, is) { } @@ -2193,8 +2629,8 @@ struct HandlerSpectrum : public HandlerParamContainer { handlerBinaryDataArray_.binaryDataArrayPtrs = &spectrum->binaryDataArrayPtrs; handlerBinaryDataArray_.integerDataArrayPtrs = &spectrum->integerDataArrayPtrs; - handlerBinaryDataArray_.defaultArrayLength = spectrum->defaultArrayLength; - handlerBinaryDataArray_.msd = msd; + handlerBinaryDataArray_.defaultArrayLength = &spectrum->defaultArrayLength; + handlerBinaryDataArray_.msd = msd; handlerBinaryDataArray_.binaryDataFlag = binaryDataFlag; return Status(Status::Delegate, &handlerBinaryDataArray_); } @@ -2221,6 +2657,7 @@ struct HandlerSpectrum : public HandlerParamContainer return Status(Status::Delegate, &handlerScan_); } } // end if name != cvParam + HandlerParamContainer::paramContainer = spectrum; return HandlerParamContainer::startElement(name, attributes, position); } @@ -2241,7 +2678,7 @@ PWIZ_API_DECL void read(std::istream& is, Spectrum& spectrum, const MSData* msd, const SpectrumIdentityFromXML *id) { - HandlerSpectrum handler(binaryDataFlag, &spectrum, legacyIdRefToNativeId, msd, id); + HandlerSpectrum handler(binaryDataFlag, &spectrum, legacyIdRefToNativeId, msd, id, &is); handler.version = version; SAXParser::parse(is, handler); } @@ -2259,7 +2696,19 @@ void write(minimxml::XMLWriter& writer, const Chromatogram& chromatogram, XMLWriter::Attributes attributes; attributes.add("index", chromatogram.index); attributes.add("id", chromatogram.id); // not an XML:ID - attributes.add("defaultArrayLength", chromatogram.defaultArrayLength); + +#ifndef WITHOUT_MZMLB + boost::iostreams::stream* mzMLb_os = dynamic_cast*>(&writer.getOutputStream()); + if (mzMLb_os) + { + attributes.add("defaultArrayLength", 0); + } + else +#endif + { + attributes.add("defaultArrayLength", chromatogram.defaultArrayLength); + } + if (chromatogram.dataProcessingPtr.get()) attributes.add("dataProcessingRef", encode_xml_id_copy(chromatogram.dataProcessingPtr->id)); @@ -2278,12 +2727,15 @@ void write(minimxml::XMLWriter& writer, const Chromatogram& chromatogram, attributes.clear(); attributes.add("count", chromatogram.binaryDataArrayPtrs.size() + chromatogram.integerDataArrayPtrs.size()); writer.startElement("binaryDataArrayList", attributes); + + BinaryDataEncoder::Config updated_config = config; + updated_config.type = BinaryDataEncoder::Type_Chromatogram; for (const auto& itr : chromatogram.binaryDataArrayPtrs) - writeBinaryDataArray(writer, *itr, config); + writeBinaryDataArray(writer, *itr, updated_config); for (const auto& itr : chromatogram.integerDataArrayPtrs) - writeBinaryDataArray(writer, *itr, config); + writeBinaryDataArray(writer, *itr, updated_config); writer.endElement(); // binaryDataArrayList } @@ -2298,9 +2750,11 @@ struct HandlerChromatogram : public HandlerParamContainer Chromatogram* chromatogram; HandlerChromatogram(BinaryDataFlag _binaryDataFlag, - Chromatogram* _chromatogram = 0) + Chromatogram* _chromatogram = 0, + std::istream* is = 0) : binaryDataFlag(_binaryDataFlag), - chromatogram(_chromatogram) + chromatogram(_chromatogram), + handlerBinaryDataArray_(0, 0, 0, is) {} virtual Status startElement(const string& name, @@ -2340,7 +2794,7 @@ struct HandlerChromatogram : public HandlerParamContainer { handlerBinaryDataArray_.binaryDataArrayPtrs = &chromatogram->binaryDataArrayPtrs; handlerBinaryDataArray_.integerDataArrayPtrs = &chromatogram->integerDataArrayPtrs; - handlerBinaryDataArray_.defaultArrayLength = chromatogram->defaultArrayLength; + handlerBinaryDataArray_.defaultArrayLength = &chromatogram->defaultArrayLength; handlerBinaryDataArray_.binaryDataFlag = binaryDataFlag; return Status(Status::Delegate, &handlerBinaryDataArray_); } @@ -2364,7 +2818,7 @@ struct HandlerChromatogram : public HandlerParamContainer PWIZ_API_DECL void read(std::istream& is, Chromatogram& chromatogram, BinaryDataFlag binaryDataFlag) { - HandlerChromatogram handler(binaryDataFlag, &chromatogram); + HandlerChromatogram handler(binaryDataFlag, &chromatogram, &is); SAXParser::parse(is, handler); } @@ -2439,6 +2893,15 @@ void write(minimxml::XMLWriter& writer, const SpectrumList& spectrumList, const write(writer, *spectrum, msd, config); spectrum->index += skipped; // restore original index in case the spectrum is held in memory and the same spectrum is written again } + +#ifndef WITHOUT_MZMLB + boost::iostreams::stream* mzMLb_os = dynamic_cast*>(&writer.getOutputStream()); + if (mzMLb_os) + { + if (spectrumPositions) + spectrumPositions->push_back(writer.positionNext()-2); + } +#endif writer.endElement(); } @@ -2561,6 +3024,15 @@ void write(minimxml::XMLWriter& writer, const ChromatogramList& chromatogramList chromatogram->index += skipped; // restore original index in case same chromatogram is written again } +#ifndef WITHOUT_MZMLB + boost::iostreams::stream* mzMLb_os = dynamic_cast*>(&writer.getOutputStream()); + if (mzMLb_os) + { + if (chromatogramPositions) + chromatogramPositions->push_back(writer.positionNext()-2); + } +#endif + writer.endElement(); } diff --git a/pwiz/data/msdata/Index_mzML.cpp b/pwiz/data/msdata/Index_mzML.cpp index 8a8b321734..b0a48bce3b 100644 --- a/pwiz/data/msdata/Index_mzML.cpp +++ b/pwiz/data/msdata/Index_mzML.cpp @@ -26,6 +26,10 @@ #include "pwiz/utility/minimxml/SAXParser.hpp" #include "boost/iostreams/positioning.hpp" #include "pwiz/data/msdata/IO.hpp" +#ifndef WITHOUT_MZMLB +#include "mzmlb/Connection_mzMLb.hpp" +using namespace pwiz::msdata::mzmlb; +#endif using namespace pwiz::util; using namespace pwiz::minimxml; @@ -266,7 +270,8 @@ class HandlerIndexCreator : public SAXParser::Handler spectrumIndex_(spectrumIndex), chromatogramCount_(chromatogramCount), chromatogramIndex_(chromatogramIndex), - legacyIdRefToNativeId(&legacyIdRefToNativeId) + legacyIdRefToNativeId(&legacyIdRefToNativeId), + ignore_(spectrumCount_ > 0) { version = schemaVersion_; } @@ -275,6 +280,9 @@ class HandlerIndexCreator : public SAXParser::Handler const Attributes& attributes, stream_offset position) { + if (ignore_) + return Status::Ok; + if (name == "spectrum") { SpectrumIdentityFromXML* si; @@ -284,31 +292,31 @@ class HandlerIndexCreator : public SAXParser::Handler getAttribute(attributes, "id", si->id); getAttribute(attributes, "spotID", si->spotID); - // mzML 1.0 - if (version == 1) + // mzML 1.0 + if (version == 1) + { + string idRef, nativeID; + getAttribute(attributes, "id", idRef); + getAttribute(attributes, "nativeID", nativeID); + if (nativeID.empty()) + si->id = idRef; + else { - string idRef, nativeID; - getAttribute(attributes, "id", idRef); - getAttribute(attributes, "nativeID", nativeID); - if (nativeID.empty()) - si->id = idRef; - else + try { - try - { - lexical_cast(nativeID); - si->id = "scan=" + nativeID; - } - catch (exception&) - { - si->id = nativeID; - } - (*legacyIdRefToNativeId)[idRef] = si->id; + lexical_cast(nativeID); + si->id = "scan=" + nativeID; + } + catch (exception&) + { + si->id = nativeID; } + (*legacyIdRefToNativeId)[idRef] = si->id; } + } - si->index = spectrumCount_; - si->sourceFilePosition = position; + si->index = spectrumCount_; + si->sourceFilePosition = position; ++spectrumCount_; } @@ -337,6 +345,7 @@ class HandlerIndexCreator : public SAXParser::Handler size_t& chromatogramCount_; vector& chromatogramIndex_; map* legacyIdRefToNativeId; + bool ignore_; }; } // namespace @@ -396,9 +405,78 @@ void Index_mzML::Impl::createIndex() const } catch (runtime_error&) { +#ifndef WITHOUT_MZMLB + boost::iostreams::stream* mzMLb_is = dynamic_cast*>(&*is_); + if (mzMLb_is) + { + + if ((*mzMLb_is)->exists("mzML_spectrumIndex")) + { + std::vector positions((*mzMLb_is)->size("mzML_spectrumIndex")); + (*mzMLb_is)->read("mzML_spectrumIndex", &positions[0], positions.size()); + + { + streamsize size = (*mzMLb_is)->size("mzML_spectrumIndex_idRef"); + char* buf = new char[size]; + (*mzMLb_is)->read("mzML_spectrumIndex_idRef", buf, size); + istringstream idRefs(string(buf, size)); + delete buf; + + for (size_t i = 0; i < positions.size() - 1; i++) + { + spectrumIndex_.push_back(SpectrumIdentityFromXML()); + spectrumIndex_.back().index = spectrumCount_; + getline(idRefs, spectrumIndex_.back().id, '\0'); + spectrumIndex_.back().sourceFilePosition = positions[i]; + ++spectrumCount_; + } + + } + + + if ((*mzMLb_is)->exists("mzML_spectrumIndex_spotID")) + { + streamsize size = (*mzMLb_is)->size("mzML_spectrumIndex_spotID"); + char* buf = new char[size]; + (*mzMLb_is)->read("mzML_spectrumIndex_spotID", buf, size); + istringstream spotIDs(string(buf, size)); + delete buf; + + for (size_t i = 0; i < positions.size() - 1; i++) + getline(spotIDs, spectrumIndex_.back().spotID, '\0'); + } + } + + if ((*mzMLb_is)->exists("mzML_chromatogramIndex")) + { + std::vector positions((*mzMLb_is)->size("mzML_chromatogramIndex")); + (*mzMLb_is)->read("mzML_chromatogramIndex", &positions[0], positions.size()); + + streamsize size = (*mzMLb_is)->size("mzML_chromatogramIndex_idRef"); + char* buf = new char[size]; + (*mzMLb_is)->read("mzML_chromatogramIndex_idRef", buf, size); + istringstream idRefs(string(buf, size)); + delete buf; + + for (size_t i = 0; i < positions.size() - 1; i++) + { + chromatogramIndex_.push_back(ChromatogramIdentity()); + chromatogramIndex_.back().index = chromatogramCount_; + getline(idRefs, chromatogramIndex_.back().id, '\0'); + chromatogramIndex_.back().sourceFilePosition = positions[i]; + ++chromatogramCount_; + } + } + + spectrumCount_ = 0; + chromatogramCount_ = 0; + } +#endif + // TODO: log warning that the index was corrupt/missing is_->clear(); is_->seekg(0); + HandlerIndexCreator handler(schemaVersion_, spectrumCount_, spectrumIndex_, legacyIdRefToNativeId_, chromatogramCount_, chromatogramIndex_); diff --git a/pwiz/data/msdata/Jamfile.jam b/pwiz/data/msdata/Jamfile.jam index 0cbc47e43f..99353e8b67 100644 --- a/pwiz/data/msdata/Jamfile.jam +++ b/pwiz/data/msdata/Jamfile.jam @@ -71,6 +71,7 @@ lib pwiz_data_msdata_core /ext/boost//filesystem /ext/boost//thread /ext/zlib//z + [ mzmlb-build mzmlb//pwiz_data_msdata_mzmlb ] : # default-build : # usage-requirements pwiz_data_msdata_version @@ -82,6 +83,7 @@ lib pwiz_data_msdata_core /ext/boost//filesystem /ext/boost//thread /ext/zlib//z + [ mzmlb-build mzmlb//pwiz_data_msdata_mzmlb ] ; lib pwiz_data_msdata diff --git a/pwiz/data/msdata/MSDataFile.cpp b/pwiz/data/msdata/MSDataFile.cpp index 12a00b7286..e5a0a2656f 100644 --- a/pwiz/data/msdata/MSDataFile.cpp +++ b/pwiz/data/msdata/MSDataFile.cpp @@ -29,6 +29,10 @@ #include "Serializer_mzXML.hpp" #include "Serializer_MGF.hpp" #include "Serializer_MSn.hpp" +#ifndef WITHOUT_MZMLB +#include "mzmlb/Connection_mzMLb.hpp" +using namespace pwiz::msdata::mzmlb; +#endif #ifndef WITHOUT_MZ5 #include "Serializer_mz5.hpp" #endif @@ -157,6 +161,7 @@ void writeStream(ostream& os, const MSData& msd, const MSDataFile::WriteConfig& break; } case MSDataFile::Format_mzML: + case MSDataFile::Format_mzMLb: { Serializer_mzML::Config serializerConfig; serializerConfig.binaryDataEncoderConfig = config.binaryDataEncoderConfig; @@ -232,6 +237,17 @@ void MSDataFile::write(const MSData& msd, #else Serializer_mz5 serializer(config); serializer.write(filename, msd, iterationListenerRegistry, config.useWorkerThreads); +#endif + break; + } + case MSDataFile::Format_mzMLb: + { +#ifdef WITHOUT_MZMLB + throw runtime_error("[MSDataFile::write()] library was not built with mzMLb support."); +#else + Connection_mzMLb con(filename, config.mzMLb_chunk_size, config.mzMLb_compression_level); + boost::iostreams::stream mzMLb_os(con, config.mzMLb_chunk_size); + writeStream(mzMLb_os, msd, config, iterationListenerRegistry); #endif break; } @@ -331,6 +347,9 @@ PWIZ_API_DECL ostream& operator<<(ostream& os, MSDataFile::Format format) case MSDataFile::Format_MZ5: os << "mz5"; return os; + case MSDataFile::Format_mzMLb: + os << "mzMLb"; + return os; default: os << "Unknown"; return os; @@ -345,7 +364,8 @@ PWIZ_API_DECL ostream& operator<<(ostream& os, const MSDataFile::WriteConfig& co config.format == MSDataFile::Format_mzXML) os << " " << config.binaryDataEncoderConfig << " indexed=\"" << boolalpha << config.indexed << "\""; - else if (config.format == MSDataFile::Format_MZ5) + else if (config.format == MSDataFile::Format_MZ5 || + config.format == MSDataFile::Format_mzMLb) os << " " << config.binaryDataEncoderConfig; return os; } diff --git a/pwiz/data/msdata/MSDataFile.hpp b/pwiz/data/msdata/MSDataFile.hpp index bf80a5ec39..ef96e52478 100644 --- a/pwiz/data/msdata/MSDataFile.hpp +++ b/pwiz/data/msdata/MSDataFile.hpp @@ -46,7 +46,7 @@ struct PWIZ_API_DECL MSDataFile : public MSData bool calculateSourceFileChecksum = false); /// data format for write() - enum PWIZ_API_DECL Format {Format_Text, Format_mzML, Format_mzXML, Format_MGF, Format_MS1, Format_CMS1, Format_MS2, Format_CMS2, Format_MZ5}; + enum PWIZ_API_DECL Format {Format_Text, Format_mzML, Format_mzXML, Format_MGF, Format_MS1, Format_CMS1, Format_MS2, Format_CMS2, Format_MZ5, Format_mzMLb}; /// configuration for write() struct PWIZ_API_DECL WriteConfig @@ -57,6 +57,9 @@ struct PWIZ_API_DECL MSDataFile : public MSData bool gzipped; // if true, file is written as .gz bool useWorkerThreads; + int mzMLb_compression_level = 0; + int mzMLb_chunk_size = 1048576; + /// when true, if an error is seen when enumerating a spectrum or chromatogram, it will be skipped and enumeration will continue; /// when false an error will immediately stop enumeration bool continueOnError; diff --git a/pwiz/data/msdata/ReaderTest.cpp b/pwiz/data/msdata/ReaderTest.cpp index 34bb416a6a..bb37a9b8d7 100755 --- a/pwiz/data/msdata/ReaderTest.cpp +++ b/pwiz/data/msdata/ReaderTest.cpp @@ -261,7 +261,7 @@ void testIdentifyFileFormat() { auto readerTypes = readerList.getTypes(); set readerTypeSet(readerTypes.begin(), readerTypes.end()); - set expectedTypeSet{ "mzML", "mzXML", "MS1", "MS2", "Mascot Generic", "Bruker Data Exchange", "MZ5", + set expectedTypeSet{ "mzML", "mzMLb", "mzXML", "MS1", "MS2", "Mascot Generic", "Bruker Data Exchange", "MZ5", "Sciex WIFF/WIFF2", "AB/Sciex T2D", "Agilent MassHunter", "Bruker FID", "Bruker YEP", "Bruker BAF", "Bruker U2", "Bruker TDF", #if defined(PWIZ_READER_MOBILION) "Mobilion MBI", @@ -279,7 +279,7 @@ void testIdentifyFileFormat() auto readerTypes = readerList.getCvTypes(); set readerCvTypeSet(readerTypes.begin(), readerTypes.end()); set expectedCvTypeSet{ MS_mzML_format, MS_ISB_mzXML_format, MS_MS1_format, MS_MS2_format, MS_Mascot_MGF_format, MS_Bruker_XML_format, MS_mz5_format, - MS_ABI_WIFF_format, MS_SCIEX_TOF_TOF_T2D_format, MS_Agilent_MassHunter_format, + MS_mzMLb_format, MS_ABI_WIFF_format, MS_SCIEX_TOF_TOF_T2D_format, MS_Agilent_MassHunter_format, MS_Bruker_FID_format, MS_Bruker_Agilent_YEP_format, MS_Bruker_BAF_format, MS_Bruker_U2_format, MS_Bruker_TDF_format, MS_mass_spectrometer_file_format, MS_Thermo_RAW_format, MS_UIMF_format, MS_Waters_raw_format }; auto expectedButNotFound = expectedCvTypeSet - readerCvTypeSet; diff --git a/pwiz/data/msdata/Serializer_mzML.cpp b/pwiz/data/msdata/Serializer_mzML.cpp index c827330194..d7ddaf6534 100644 --- a/pwiz/data/msdata/Serializer_mzML.cpp +++ b/pwiz/data/msdata/Serializer_mzML.cpp @@ -31,6 +31,10 @@ #include "pwiz/utility/minimxml/XMLWriter.hpp" #include "pwiz/utility/minimxml/SAXParser.hpp" #include "pwiz/utility/misc/Std.hpp" +#ifndef WITHOUT_MZMLB +#include "mzmlb/Connection_mzMLb.hpp" +using namespace pwiz::msdata::mzmlb; +#endif namespace pwiz { namespace msdata { @@ -162,7 +166,12 @@ void Serializer_mzML::Impl::write(ostream& os, const MSData& msd, // start +#ifndef WITHOUT_MZMLB + boost::iostreams::stream* mzMLb_os = dynamic_cast*>(&os); + if (config_.indexed && !mzMLb_os) +#else if (config_.indexed) +#endif { XMLWriter::Attributes attributes; attributes.push_back(make_pair("xmlns", "http://psi.hupo.org/ms/mzml")); @@ -186,8 +195,54 @@ void Serializer_mzML::Impl::write(ostream& os, const MSData& msd, return; // end - + +#ifndef WITHOUT_MZMLB + if (mzMLb_os) + { + if (!spectrumPositions.empty()) + { + (*mzMLb_os)->write("mzML_spectrumIndex", &spectrumPositions[0], spectrumPositions.size()); + + ostringstream idRefs; + ostringstream spotIDs; + bool hasSpotIDs = false; + + for (unsigned int i = 0; i < spectrumPositions.size() - 1; ++i) + { + const SpectrumIdentity& si = msd.run.spectrumListPtr->spectrumIdentity(i); + idRefs.write(si.id.c_str(), si.id.size() + 1); + + if(!si.spotID.empty()) + { + spotIDs.write(si.spotID.c_str(), si.spotID.size() + 1); + hasSpotIDs = true; + } + } + + (*mzMLb_os)->write("mzML_spectrumIndex_idRef", idRefs.str().c_str(), idRefs.str().size()); + + if (hasSpotIDs) + (*mzMLb_os)->write("mzML_spectrumIndex_spotID", spotIDs.str().c_str(), spotIDs.str().size()); + } + + if (!chromatogramPositions.empty()) + { + (*mzMLb_os)->write("mzML_chromatogramIndex", &chromatogramPositions[0], chromatogramPositions.size()); + + ostringstream idRefs; + for (unsigned int i = 0; i < chromatogramPositions.size() - 1; ++i) + { + const ChromatogramIdentity& ci = msd.run.chromatogramListPtr->chromatogramIdentity(i); + idRefs.write(ci.id.c_str(), ci.id.size() + 1); + } + + (*mzMLb_os)->write("mzML_chromatogramIndex_idRef", idRefs.str().c_str(), idRefs.str().size()); + } + } + else if (config_.indexed) +#else if (config_.indexed) +#endif { stream_offset indexListOffset = xmlWriter.positionNext(); diff --git a/pwiz/data/msdata/mzmlb/Connection_mzMLb.cpp b/pwiz/data/msdata/mzmlb/Connection_mzMLb.cpp new file mode 100644 index 0000000000..e6c3ef35ba --- /dev/null +++ b/pwiz/data/msdata/mzmlb/Connection_mzMLb.cpp @@ -0,0 +1,616 @@ +// +// $Id: Connection_mzMLb.cpp +// +// +// Original authors: Andrew Dowsey +// +// Copyright 2017 biospi Laboratory, +// University of Bristol, UK +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + + +#include "Connection_mzMLb.hpp" +#include +#include +#include + + +//using namespace std; +using namespace boost::iostreams; + +#define CURRENT_VERSION "mzMLb 1.0" + +namespace pwiz { +namespace msdata { +namespace mzmlb { + +Connection_mzMLb::Connection_mzMLb(const std::string& id, bool identifyOnly) +{ + H5Eset_auto(H5E_DEFAULT, NULL, NULL); + + // open HDF5 file for reading + file_ = H5Fopen(id.c_str(), H5F_ACC_RDONLY, H5P_DEFAULT); + if (file_ < 0) + throw std::runtime_error("[Connection_mzMLb::open()] Could not open mzMLb file for reading."); + + // open dataset with stored mzML XML to find chunk size + hsize_t chunk_size; + hid_t dataset = H5Dopen(file_, "mzML", H5P_DEFAULT); + { + if (dataset < 0) + { + H5Fclose(file_); + throw std::runtime_error("[Connection_mzMLb::open()] Could not open mzML dataset for reading."); + } + + hid_t dcpl = H5Dget_create_plist(dataset); + { + H5Pget_chunk(dcpl, 1, &chunk_size); + } + H5Pclose(dcpl); + } + H5Dclose(dataset); + + // open again with appropraite dataset cache + hid_t dapl = H5Pcreate(H5P_DATASET_ACCESS); + { + size_t nslots; // Number of slots in the hash table + size_t nbytes; // Size of chunk cache in bytes + double w0; // Chunk preemption policy + H5Pget_chunk_cache(dapl, &nslots, &nbytes, &w0); + nbytes = (nbytes > chunk_size) ? nbytes : chunk_size; + w0 = 1.0; // since pwiz only writes a spectrum once + H5Pset_chunk_cache(dapl, nslots, nbytes, w0); + mzML_.dataset = H5Dopen(file_, "mzML", dapl); + mzML_.space = H5Dget_space(mzML_.dataset); + hsize_t size, maxdims; + H5Sget_simple_extent_dims(mzML_.space, &size, &maxdims); + mzML_.size = size; + + // get version number + hid_t aid = H5Aopen(mzML_.dataset, "version", H5P_DEFAULT); + { + if (aid < 0) + { + H5Aclose(aid); + close(); + throw std::runtime_error("[Connection_mzMLb::open()] This does not look like an mzMLb file."); + } + + hid_t atype = H5Aget_type(aid); + // at this point it's definitely an mzMLb file and if version is wrong it should not throw an exception that would cause Reader::identify to not identify the file + if (!identifyOnly) + { + hid_t atype_mem = H5Tget_native_type(atype, H5T_DIR_ASCEND); + { + char* ver = new char[H5Tget_size(atype)]; + H5Aread(aid, atype_mem, ver); + std::string version = ver; + if (version != CURRENT_VERSION) + { + H5Aclose(aid); + H5Aclose(atype_mem); + close(); + throw std::runtime_error("[Connection_mzMLb::open()] Cannot read this version of mzMLb: \"" + version + "\" (or version is not fixed-length string); only " CURRENT_VERSION " is supported"); + } + + } + H5Aclose(atype_mem); + } + H5Aclose(atype); + } + H5Aclose(aid); + } + H5Pclose(dapl); + + if (identifyOnly) + { + close(); + return; + } + + opaque_id_ = H5Tcreate(H5T_OPAQUE, 1); + } + + +Connection_mzMLb::Connection_mzMLb(const std::string& id, int chunk_size, int compression_level) : + chunk_size_(chunk_size), + compression_level_(compression_level) +{ + H5Eset_auto(H5E_DEFAULT, NULL, NULL); + + // create/truncate HDF5 file for writing + hid_t fapl = H5Pcreate(H5P_FILE_ACCESS); + { + int nelemts; // Dummy parameter in API, no longer used + size_t nslots; // Number of slots in the hash table + size_t nbytes; // Size of chunk cache in bytes + double w0; // Chunk preemption policy + H5Pget_cache(fapl, &nelemts, &nslots, &nbytes, &w0); + nbytes = (nbytes > chunk_size) ? nbytes : chunk_size; // Set per dataset cache to twice the chunk size please + w0 = 1.0; // since pwiz only writes a spectrum once + H5Pset_cache(fapl, nelemts, nslots, nbytes, w0); + file_ = H5Fcreate(id.c_str(), H5F_ACC_TRUNC, H5P_DEFAULT, fapl); + if (file_ < 0) + { + H5Pclose(fapl); + throw std::runtime_error("[Connection_mzMLb::Connection_mzMLb()] Could not open or create mzMLb file for writing."); + } + + // create dataset to store mzML XML + hid_t dcpl = H5Pcreate(H5P_DATASET_CREATE); + { + hsize_t cdims = chunk_size; + H5Pset_chunk(dcpl, 1, &cdims); + if (compression_level > 0) + { + hsize_t level = compression_level; + H5Pset_deflate(dcpl, level); + } + H5Pset_fletcher32(dcpl); + hsize_t maxdims = H5S_UNLIMITED; + mzML_.space = H5Screate_simple(1, &mzML_.size, &maxdims); + mzML_.dataset = H5Dcreate(file_, "mzML", H5T_NATIVE_CHAR, mzML_.space, H5P_DEFAULT, dcpl, H5P_DEFAULT); + } + H5Pclose(dcpl); + + // write version string + hid_t aid = H5Screate(H5S_SCALAR); + { + hid_t atype = H5Tcopy(H5T_C_S1); + { + H5Tset_size(atype, 10); + H5Tset_strpad(atype, H5T_STR_NULLTERM); + hid_t attrid = H5Acreate(mzML_.dataset, "version", atype, aid, H5P_DEFAULT, H5P_DEFAULT); + { + std::string version_out = CURRENT_VERSION; + H5Awrite(attrid, atype, version_out.c_str()); + } + H5Aclose(attrid); + } + H5Tclose(atype); + } + H5Sclose(aid); + } + H5Pclose(fapl); + + opaque_id_ = H5Tcreate(H5T_OPAQUE, 1); +} + + +void Connection_mzMLb::close() +{ + H5Tclose(opaque_id_); + + H5Dclose(mzML_.dataset); + H5Sclose(mzML_.space); + + for (std::map::iterator it = binary_.begin(); it != binary_.end(); ++it) + { + H5Dclose(it->second.dataset); + H5Sclose(it->second.space); + } + + H5Fclose(file_); +} + + +// read mzMLb "mzML" dataset +std::streamsize Connection_mzMLb::read(char* s, std::streamsize n) +{ + // don't read past end of dataset + if (mzML_.pos + n > mzML_.size) + { + n = mzML_.size - mzML_.pos; + } + + if (n > 0) + { + // read + hsize_t count = n; + H5Sselect_hyperslab(mzML_.space, H5S_SELECT_SET, &mzML_.pos, NULL, &count, NULL); + hid_t mspace = H5Screate_simple(1, &count, &count); + { + H5Dread(mzML_.dataset, H5T_NATIVE_CHAR, mspace, mzML_.space, H5P_DEFAULT, s); + } + H5Sclose(mspace); + + mzML_.pos += n; + + return n; + } + else + { + return -1; + } +} + + +// write mzMLb "mzML" dataset +std::streamsize Connection_mzMLb::write(const char* s, std::streamsize n) +{ + // extend dataset size if needed + if (mzML_.pos + n > mzML_.size) + { + mzML_.size = mzML_.pos + n; + H5Dset_extent(mzML_.dataset, &mzML_.size); + H5Sclose(mzML_.space); + mzML_.space = H5Dget_space(mzML_.dataset); + } + + // write + hsize_t count = n; + H5Sselect_hyperslab(mzML_.space, H5S_SELECT_SET, &mzML_.pos, NULL, &count, NULL); + hid_t mspace = H5Screate_simple(1, &count, &count); + { + H5Dwrite(mzML_.dataset, H5T_NATIVE_CHAR, mspace, mzML_.space, H5P_DEFAULT, s); + } + H5Sclose(mspace); + + mzML_.pos += n; + + return n; +} + + +// seek mzMLb "mzML" dataset +stream_offset Connection_mzMLb::seek(stream_offset off, std::ios_base::seekdir way) +{ + switch (way) + { + case std::ios_base::beg: + mzML_.pos = off; + break; + case std::ios_base::cur: + mzML_.pos += off; + break; + case std::ios_base::end: + default: + mzML_.pos = mzML_.size - off; + break; + } + + + return mzML_.pos; +} + + +// read mzMLb mzML index +/*void Connection_mzMLb::readIndex(const std::string& id, std::vector& positions) +{ + // open dataset to find chunk size + hsize_t chunk_size; + hid_t dataset = H5Dopen(file_, id.c_str(), H5P_DEFAULT); + { + if (dataset < 0) + { + throw std::runtime_error("[Connection_mzMLb::read()] Could not open dataset " + id + " for reading."); + } + hid_t dcpl = H5Dget_create_plist(dataset); + { + H5Pget_chunk(dcpl, 1, &chunk_size); + } + H5Pclose(dcpl); + } + H5Dclose(dataset); + + // open dataset + hid_t dapl = H5Pcreate(H5P_DATASET_ACCESS); + { + size_t nslots; // Number of slots in the hash table + size_t nbytes; // Size of chunk cache in bytes + double w0; // Chunk preemption policy + H5Pget_chunk_cache(dapl, &nslots, &nbytes, &w0); + nbytes = (nbytes > chunk_size) ? nbytes : chunk_size; // Set per dataset cache to twice the chunk size please + w0 = 1.0; // since pwiz only writes a spectrum once + H5Pset_chunk_cache(dapl, nslots, nbytes, w0); + hid_t dataset = H5Dopen(file_, id.c_str(), dapl); + { + hid_t space = H5Dget_space(dataset); + { + hsize_t size, maxdims; + H5Sget_simple_extent_dims(space, &size, &maxdims); + + // read + hid_t mspace = H5Screate_simple(1, &size, &size); + { + std::vector positions_(size); + H5Dread(dataset, H5T_NATIVE_LLONG, mspace, space, H5P_DEFAULT, &positions_[0]); + positions.assign(positions_.begin(), positions_.end()); + } + H5Sclose(mspace); + } + H5Sclose(space); + } + H5Dclose(dataset); + } + H5Pclose(dapl); +} + + +// write mzMLb mzML index +void Connection_mzMLb::writeIndex(const std::string& id, const std::vector& positions) +{ + // create dataset + hid_t dcpl = H5Pcreate(H5P_DATASET_CREATE); + { + hsize_t cdims = chunk_size_; + H5Pset_chunk(dcpl, 1, &cdims); + if (compression_level_ > 0) + { + hsize_t level = compression_level_; + H5Pset_shuffle(dcpl); + H5Pset_deflate(dcpl, level); + } + H5Pset_fletcher32(dcpl); + hsize_t maxdims = H5S_UNLIMITED; + hsize_t count = positions.size(); + hid_t space = H5Screate_simple(1, &count, &maxdims); + { + hid_t dataset = H5Dcreate(file_, id.c_str(), H5T_NATIVE_LLONG, space, H5P_DEFAULT, dcpl, H5P_DEFAULT); + { + // write + std::vector positions_(positions.begin(), positions.end()); + hid_t mspace = H5Screate_simple(1, &count, &count); + { + H5Dwrite(dataset, H5T_NATIVE_LLONG, mspace, space, H5P_DEFAULT, &positions_[0]); + } + H5Sclose(mspace); + } + H5Dclose(dataset); + } + H5Sclose(space); + } + H5Pclose(dcpl); + }*/ + + +bool Connection_mzMLb::exists(const std::string& id) +{ + return H5Lexists(file_, id.c_str(), H5P_DEFAULT) > 0; +} + + +std::streamsize Connection_mzMLb::size(const std::string& id) +{ + hid_t dataset = H5Dopen(file_, id.c_str(), H5P_DEFAULT); + + if (dataset < 0) + throw std::runtime_error("[Connection_mzMLb::read()] Could not open dataset " + id + "."); + + hid_t space = H5Dget_space(dataset); + + hsize_t size; + H5Sget_simple_extent_dims(space, &size, 0); + + H5Sclose(space); + H5Dclose(dataset); + + return size; +} + + +std::streamsize Connection_mzMLb::read_opaque(const std::string& id, void* buf, std::streamsize n) +{ + return read(id, buf, n, opaque_id_); +} + + +std::streamsize Connection_mzMLb::read(const std::string& id, char* buf, std::streamsize n) +{ + return read(id, buf, n, H5T_NATIVE_CHAR); +} + + +std::streamsize Connection_mzMLb::read(const std::string& id, double* buf, std::streamsize n) +{ + return read(id, buf, n, H5T_NATIVE_DOUBLE); +} + + +std::streamsize Connection_mzMLb::read(const std::string& id,long* buf, std::streamsize n) +{ + return read(id, buf, n, H5T_NATIVE_LONG); +} + + +std::streamsize Connection_mzMLb::read(const std::string& id, long long* buf, std::streamsize n) +{ + return read(id, buf, n, H5T_NATIVE_LLONG); +} + + +std::streamsize Connection_mzMLb::read(const std::string& id, void* buf, std::streamsize n, hid_t native_format) +{ + Stream& s_ = binary_[id]; + if (!s_.dataset) + { + // open dataset to find chunk size + hsize_t chunk_size; + hid_t dataset = H5Dopen(file_, id.c_str(), H5P_DEFAULT); + { + if (dataset < 0) + { + throw std::runtime_error("[Connection_mzMLb::read()] Could not open dataset " + id + " for reading."); + } + hid_t dcpl = H5Dget_create_plist(dataset); + { + H5Pget_chunk(dcpl, 1, &chunk_size); + } + H5Pclose(dcpl); + } + H5Dclose(dataset); + + // open dataset + hid_t dapl = H5Pcreate(H5P_DATASET_ACCESS); + { + size_t nslots; // Number of slots in the hash table + size_t nbytes; // Size of chunk cache in bytes + double w0; // Chunk preemption policy + H5Pget_chunk_cache(dapl, &nslots, &nbytes, &w0); + nbytes = (nbytes > chunk_size) ? nbytes : chunk_size; // Set per dataset cache to twice the chunk size please + w0 = 1.0; // since pwiz only writes a spectrum once + H5Pset_chunk_cache(dapl, nslots, nbytes, w0); + s_.dataset = H5Dopen(file_, id.c_str(), dapl); + s_.space = H5Dget_space(s_.dataset); + hsize_t size; + H5Sget_simple_extent_dims(s_.space, &size, 0); + s_.size = size; + } + H5Pclose(dapl); + } + + // don't read past end of dataset + if (s_.pos + n > s_.size) + { + n = s_.size - s_.pos; + } + + if (n > 0) + { + // read + hsize_t count = n; + H5Sselect_hyperslab(s_.space, H5S_SELECT_SET, &s_.pos, NULL, &count, NULL); + hid_t mspace = H5Screate_simple(1, &count, &count); + { + H5Dread(s_.dataset, native_format, mspace, s_.space, H5P_DEFAULT, buf); + } + H5Sclose(mspace); + + s_.pos += n; + + return n; + } + else + { + return -1; + } +} + + +std::streamsize Connection_mzMLb::write_opaque(const std::string& id, const void* buf, std::streamsize n) +{ + return write(id, buf, n, opaque_id_, opaque_id_, 1); +} + + +std::streamsize Connection_mzMLb::write(const std::string& id, const char* buf, std::streamsize n) +{ + return write(id, buf, n, H5T_NATIVE_CHAR, H5T_NATIVE_CHAR, sizeof(char)); +} + + +std::streamsize Connection_mzMLb::write(const std::string& id, const float* buf, std::streamsize n) +{ + return write(id, buf, n, H5T_NATIVE_FLOAT, H5T_NATIVE_FLOAT, sizeof(float)); +} + + +std::streamsize Connection_mzMLb::write(const std::string& id, const double* buf, std::streamsize n) +{ + return write(id, buf, n, H5T_NATIVE_DOUBLE, H5T_NATIVE_DOUBLE, sizeof(double)); +} + + +std::streamsize Connection_mzMLb::write(const std::string& id, const long* buf, std::streamsize n) +{ + return write(id, buf, n, H5T_NATIVE_LONG, H5T_NATIVE_LONG, sizeof(long)); +} + + +std::streamsize Connection_mzMLb::write(const std::string& id, const long long* buf, std::streamsize n) +{ + return write(id, buf, n, H5T_NATIVE_LLONG, H5T_NATIVE_LLONG, sizeof(long long)); +} + + +std::streamsize Connection_mzMLb::write(const std::string& id, const void* buf, std::streamsize n, hid_t native_format, hid_t format, size_t bytes) +{ + Stream& stream = binary_[id]; + if (!stream.dataset) + { + // create dataset + hid_t dcpl = H5Pcreate(H5P_DATASET_CREATE); + { + hsize_t cdims = chunk_size_ / bytes; + H5Pset_chunk(dcpl, 1, &cdims); + + if (compression_level_ > 0) + { + hsize_t level = compression_level_; + if (bytes > 1) H5Pset_shuffle(dcpl); + H5Pset_deflate(dcpl, level); + } + H5Pset_fletcher32(dcpl); + hsize_t maxdims = H5S_UNLIMITED; + stream.size = n; + stream.space = H5Screate_simple(1, &stream.size, &maxdims); + stream.dataset = H5Dcreate(file_, id.c_str(), format, stream.space, H5P_DEFAULT, dcpl, H5P_DEFAULT); + stream.format = format; + } + H5Pclose(dcpl); + } + else + { + // extend dataset size if needed + if (stream.pos + n > stream.size) + { + stream.size = stream.pos + n; + H5Dset_extent(stream.dataset, &stream.size); + H5Sclose(stream.space); + stream.space = H5Dget_space(stream.dataset); + } + } + + + // write + hsize_t count = n; + H5Sselect_hyperslab(stream.space, H5S_SELECT_SET, &stream.pos, NULL, &count, NULL); + hid_t mspace = H5Screate_simple(1, &count, &count); + { + H5Dwrite(stream.dataset, native_format, mspace, stream.space, H5P_DEFAULT, buf); + } + H5Sclose(mspace); + + stream.pos += n; + + return n; +} + + +stream_offset Connection_mzMLb::seek(const std::string& id, stream_offset off, std::ios_base::seekdir way) +{ + Stream& stream = binary_[id]; + + switch (way) + { + case std::ios_base::beg: + stream.pos = off; + break; + case std::ios_base::cur: + stream.pos += off; + break; + case std::ios_base::end: + default: + stream.pos = stream.size - off; + break; + } + + return stream.pos; +} + +} // mzmlb +} // msdata +} // pwiz diff --git a/pwiz/data/msdata/mzmlb/Connection_mzMLb.hpp b/pwiz/data/msdata/mzmlb/Connection_mzMLb.hpp new file mode 100644 index 0000000000..a23ecf9636 --- /dev/null +++ b/pwiz/data/msdata/mzmlb/Connection_mzMLb.hpp @@ -0,0 +1,102 @@ +// +// $Id: Connection_mzMLb.hpp +// +// +// Original authors: Andrew Dowsey +// +// Copyright 2017 biospi Laboratory, +// University of Bristol, UK +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + +#ifndef CONNECTION_MZMLB_HPP_ +#define CONNECTION_MZMLB_HPP_ + + +#include +#include // seekable_device +#include +#include +#include +#include + + +namespace pwiz { +namespace msdata { +namespace mzmlb { + +using namespace boost::iostreams; + +class Connection_mzMLb : public device { +public: + Connection_mzMLb(const std::string& filename, int chunk_size, int compression_level); // open for writing + Connection_mzMLb(const std::string& filename, bool identifyOnly = false); // open for reading or identify() + void close(); // close (called by boost stream on final destruction) + + // boost device methods for reading/writing text to "mzML" dataset + std::streamsize read(char* s, std::streamsize n); + std::streamsize write(const char* s, std::streamsize n); + stream_offset seek(stream_offset off, std::ios_base::seekdir way); + + // methods for reading/writing other binary datasets + bool exists(const std::string& id); + std::streamsize size(const std::string& id); + + std::streamsize read_opaque(const std::string& id, void* buf, std::streamsize n); + std::streamsize read(const std::string& id, char* buf, std::streamsize n); + std::streamsize read(const std::string& id, double* buf, std::streamsize n); + std::streamsize read(const std::string& id, long* buf, std::streamsize n); + std::streamsize read(const std::string& id, long long* buf, std::streamsize n); + + std::streamsize write_opaque(const std::string& id, const void* buf, std::streamsize n); + std::streamsize write(const std::string& id, const char* buf, std::streamsize n); + std::streamsize write(const std::string& id, const float* buf, std::streamsize n); + std::streamsize write(const std::string& id, const double* buf, std::streamsize n); + std::streamsize write(const std::string& id, const long* buf, std::streamsize n); + std::streamsize write(const std::string& id, const long long* buf, std::streamsize n); + + stream_offset seek(const std::string& id, stream_offset off, std::ios_base::seekdir way); + + +private: + std::streamsize read(const std::string& id, void* buf, std::streamsize n, hid_t native_format); + std::streamsize write(const std::string& path, const void* buf, std::streamsize n, hid_t native_format, hid_t format, size_t bytes); + + hid_t opaque_id_; + hid_t file_; + unsigned long chunk_size_; + unsigned long compression_level_; + + struct Stream { + hid_t dataset; + hid_t space; + hsize_t pos; + hsize_t size; + hid_t format; + + Stream() : dataset(0), space(0), pos(0), size(0) {} + }; + + Stream& open(const std::string& id, hid_t format); + Stream& create(const std::string& id, hid_t format, size_t bytes); + + Stream mzML_; // stream parameters for "mzML" text dataset + std::map binary_; // stream parameters for binary datasets +}; + +} // mzmlb +} // msdata +} // pwiz + +#endif /* CONNECTION_MZMLB_HPP_ */ diff --git a/pwiz/data/msdata/mzmlb/Jamfile.jam b/pwiz/data/msdata/mzmlb/Jamfile.jam new file mode 100644 index 0000000000..61b3f8c9b0 --- /dev/null +++ b/pwiz/data/msdata/mzmlb/Jamfile.jam @@ -0,0 +1,60 @@ +# +# $Id: Jamfile.jam 5921 2014-03-19 17:34:13Z chambm $ +# +# +# Original authors: Mathias Wilhelm +# Marc Kirchner +# +# Copyright 2011 Proteomics Center +# Children's Hospital Boston, Boston, MA 02135 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# + +import feature ; + +rule usage-requirements ( properties * ) +{ + local result ; + if msvc in $(properties) + { + # HACK: shared isn't being passed in properties like I'd expect, so I also check the command-line + #if shared in $(properties) || link=shared in [ modules.peek : ARGV ] + #{ + # result += /ext/boost//thread/shared ; + #} + #else + { + result += /ext/boost//thread ; + } + } +} + +local hdf5-src-location = [ feature.get-values : $(options) ] ; +hdf5-src-location ?= $(PWIZ_LIBRARIES_PATH)/hdf5-1.8.7 ; +path-constant HDF5_SOURCE : $(hdf5-src-location[1]) ; +using ext-hdf5 : 1.8.7 : $(HDF5_SOURCE) ; + +lib pwiz_data_msdata_mzmlb + : # sources + Connection_mzMLb.cpp + : # requirements + static + /ext/hdf5//hdf5 + @usage-requirements + : # default-build + : # usage-requirements + . + /ext/hdf5//hdf5 + @usage-requirements + ; diff --git a/pwiz/utility/bindings/CLI/msdata/MSDataFile.hpp b/pwiz/utility/bindings/CLI/msdata/MSDataFile.hpp index c929f686d5..8a828e3bfd 100644 --- a/pwiz/utility/bindings/CLI/msdata/MSDataFile.hpp +++ b/pwiz/utility/bindings/CLI/msdata/MSDataFile.hpp @@ -66,7 +66,7 @@ public ref class MSDataFile : public MSData /// /// supported data formats for write() /// - enum class Format {Format_Text, Format_mzML, Format_mzXML, Format_MGF, Format_MS1, Format_CMS1, Format_MS2, Format_CMS2, Format_MZ5}; + enum class Format {Format_Text, Format_mzML, Format_mzXML, Format_MGF, Format_MS1, Format_CMS1, Format_MS2, Format_CMS2, Format_MZ5, Format_mzMLb}; enum class Precision {Precision_32, Precision_64}; enum class ByteOrder {ByteOrder_LittleEndian, ByteOrder_BigEndian}; diff --git a/pwiz/utility/minimxml/XMLWriter.cpp b/pwiz/utility/minimxml/XMLWriter.cpp index bd10ec2996..87d96880c4 100644 --- a/pwiz/utility/minimxml/XMLWriter.cpp +++ b/pwiz/utility/minimxml/XMLWriter.cpp @@ -87,6 +87,7 @@ class XMLWriter::Impl void characters(const string& text, bool autoEscape); bio::stream_offset position() const; bio::stream_offset positionNext() const; + ostream& getOutputStream() const; private: ostream& os_; @@ -288,6 +289,12 @@ XMLWriter::stream_offset XMLWriter::Impl::positionNext() const } +ostream& XMLWriter::Impl::getOutputStream() const +{ + return os_; +} + + // // XMLWriter forwarding functions // @@ -321,6 +328,7 @@ PWIZ_API_DECL XMLWriter::stream_offset XMLWriter::position() const {return impl_ PWIZ_API_DECL XMLWriter::stream_offset XMLWriter::positionNext() const {return impl_->positionNext();} +PWIZ_API_DECL ostream& XMLWriter::getOutputStream() const {return impl_->getOutputStream();} namespace { diff --git a/pwiz/utility/minimxml/XMLWriter.hpp b/pwiz/utility/minimxml/XMLWriter.hpp index 34c00fecf2..bb63145964 100644 --- a/pwiz/utility/minimxml/XMLWriter.hpp +++ b/pwiz/utility/minimxml/XMLWriter.hpp @@ -127,6 +127,9 @@ class PWIZ_API_DECL XMLWriter /// returns stream position of next element start tag stream_offset positionNext() const; + + // returns output stream for writing binary datasets if mzMLb + std::ostream& getOutputStream() const; private: diff --git a/pwiz/utility/misc/Jamfile.jam b/pwiz/utility/misc/Jamfile.jam index fea500f58e..fa484a6275 100644 --- a/pwiz/utility/misc/Jamfile.jam +++ b/pwiz/utility/misc/Jamfile.jam @@ -105,7 +105,7 @@ lib pwiz_utility_vendor_reader_test_harness pwiz_utility_misc $(PWIZ_ROOT_PATH)/pwiz/data/msdata//pwiz_data_msdata $(PWIZ_ROOT_PATH)/pwiz/analysis/spectrum_processing//pwiz_analysis_spectrum_processing - io_semaphore + #io_semaphore #mt_semaphore ; explicit pwiz_utility_vendor_reader_test_harness ; diff --git a/pwiz/utility/misc/String.cpp b/pwiz/utility/misc/String.cpp index 01b4824484..225e3da963 100644 --- a/pwiz/utility/misc/String.cpp +++ b/pwiz/utility/misc/String.cpp @@ -26,6 +26,7 @@ using boost::spirit::karma::real_policies; using boost::spirit::karma::real_generator; using boost::spirit::karma::int_generator; +using boost::spirit::karma::uint_generator; using boost::spirit::karma::generate; template @@ -134,4 +135,13 @@ std::string pwiz::util::toString(int value) char* p = buffer; generate(p, intgen, value); return std::string(&buffer[0], p); +} + +std::string pwiz::util::toString(size_t value) +{ + static const uint_generator intgen = uint_generator(); + char buffer[256]; + char* p = buffer; + generate(p, intgen, value); + return std::string(&buffer[0], p); } \ No newline at end of file diff --git a/pwiz/utility/misc/String.hpp b/pwiz/utility/misc/String.hpp index 99a73a2a13..5308f1c09d 100644 --- a/pwiz/utility/misc/String.hpp +++ b/pwiz/utility/misc/String.hpp @@ -140,6 +140,9 @@ std::string toString(double value, RealConvertPolicy policyFlags = RealConvertPo /// uses boost::spirit::karma to do faster conversion (relative to lexical_cast) of int to string std::string toString(int value); +/// uses boost::spirit::karma to do faster conversion (relative to lexical_cast) of size_t to string +std::string toString(size_t value); + } // namespace util } // namespace pwiz diff --git a/pwiz/utility/misc/VendorReaderTestHarness.cpp b/pwiz/utility/misc/VendorReaderTestHarness.cpp index 6840e312ef..2ec5208381 100644 --- a/pwiz/utility/misc/VendorReaderTestHarness.cpp +++ b/pwiz/utility/misc/VendorReaderTestHarness.cpp @@ -43,6 +43,7 @@ #include "boost/thread/thread.hpp" #include "boost/thread/barrier.hpp" #include "boost/locale/encoding_utf.hpp" +#include "pwiz/data/msdata/Serializer_mzML.hpp" using namespace pwiz::util; @@ -384,6 +385,59 @@ void testRead(const Reader& reader, const string& rawpath, const bfs::path& pare } #endif +#ifndef WITHOUT_MZMLB + // mzML <-> mzMLb + if (findUnicodeBytes(rawpath) == rawpath.end()) + { + if (os_) (*os_) << "mzMLb serialization test of " << config.resultFilename(msd.run.id + ".mzML") << endl; + string targetResultFilename_mzMLb = bfs::change_extension(targetResultFilename, ".mzMLb").string(); + { + MSDataFile::WriteConfig config_mzMLb(MSDataFile::Format_mzMLb); + config_mzMLb.binaryDataEncoderConfig.compression = BinaryDataEncoder::Compression_Zlib; + { + MSDataFile::write(vendorMsd, targetResultFilename_mzMLb, config_mzMLb); + MSDataFile msd_mzMLb(targetResultFilename_mzMLb); + msd_mzMLb.fileDescription.sourceFilePtrs.erase(msd_mzMLb.fileDescription.sourceFilePtrs.end() - 1); + + DiffConfig diffConfig_mzMLb(diffConfig); + diffConfig_mzMLb.ignoreDataProcessing = true; + Diff diff_mzMLb(vendorMsd, msd_mzMLb, diffConfig_mzMLb); + if (diff_mzMLb) cerr << headDiff(diff_mzMLb, 5000) << endl; + unit_assert(!diff_mzMLb); + } + + config_mzMLb.binaryDataEncoderConfig.numpressSlofErrorTolerance = 1e-6; + //config_mzMLb.binaryDataEncoderConfig.numpressFixedPoint = 1e-5; + config_mzMLb.binaryDataEncoderConfig.numpressLinearErrorTolerance = 1e-5; + config_mzMLb.binaryDataEncoderConfig.numpressLinearAbsMassAcc = 1e-5; + for (int i = BinaryDataEncoder::Numpress_Linear; i <= (int) BinaryDataEncoder::Numpress_Slof; ++i) + { + config_mzMLb.binaryDataEncoderConfig.numpressOverrides.clear(); + auto numpress = static_cast(i); + if (numpress == BinaryDataEncoder::Numpress_Pic) + config_mzMLb.binaryDataEncoderConfig.numpressOverrides[MS_intensity_array] = numpress; + else + config_mzMLb.binaryDataEncoderConfig.numpress = numpress; + + MSDataFile::write(vendorMsd, targetResultFilename_mzMLb, config_mzMLb); + MSDataFile msd_mzMLb(targetResultFilename_mzMLb); + msd_mzMLb.fileDescription.sourceFilePtrs.erase(msd_mzMLb.fileDescription.sourceFilePtrs.end() - 1); + + DiffConfig diffConfig_mzMLb(diffConfig); + if (numpress == BinaryDataEncoder::Numpress_Pic) + diffConfig_mzMLb.precision = 1.0; // the precision diff is relative: (1 - 0.5)/0.5 + else + diffConfig_mzMLb.precision = 0.05; + diffConfig_mzMLb.ignoreDataProcessing = true; + Diff diff_mzMLb(vendorMsd, msd_mzMLb, diffConfig_mzMLb); + if (diff_mzMLb) cerr << headDiff(diff_mzMLb, 5000) << endl; + unit_assert(!diff_mzMLb); + } + } + bfs::remove(targetResultFilename_mzMLb); + } +#endif + // test SpectrumList::min_level_accepted() for vendors if (msd.run.spectrumListPtr && msd.run.spectrumListPtr->size() > 0) { @@ -414,6 +468,27 @@ void testRead(const Reader& reader, const string& rawpath, const bfs::path& pare bal::contains(fileType, "T2D")) diffConfig_non_mzML.ignoreIdentity = true; + if (msd.run.spectrumListPtr && config.preferOnlyMsLevel != 2) + { + // mzML <-> MGF + vendorMsd.run.spectrumListPtr = SpectrumListPtr(new SpectrumList_MGF_Filter(vendorMsd.run.spectrumListPtr)); + MSData msd_MGF; + Serializer_MGF serializer_MGF; + serializer_MGF.write(*stringstreamPtr, vendorMsd); + if (os_) *os_ << "MGF:\n" << stringstreamPtr->str() << endl; + serializer_MGF.read(serializedStreamPtr, msd_MGF); + + diffConfig_non_mzML.ignoreIdentity = true; + Diff diff_MGF(vendorMsd, msd_MGF, diffConfig_non_mzML); + if (diff_MGF && !os_) cerr << "MGF:\n" << headStream(*serializedStreamPtr, 5000) << endl; + if (diff_MGF) cerr << headDiff(diff_MGF, 5000) << endl; + unit_assert(!diff_MGF); + } + + stringstreamPtr->str(" "); + stringstreamPtr->clear(); + stringstreamPtr->seekp(0); + if (vendorMsd.run.spectrumListPtr) { // mzML <-> mzXML @@ -434,27 +509,6 @@ void testRead(const Reader& reader, const string& rawpath, const bfs::path& pare if (diff_mzXML) cerr << headDiff(diff_mzXML, 5000) << endl; unit_assert(!diff_mzXML); } - - stringstreamPtr->str(" "); - stringstreamPtr->clear(); - stringstreamPtr->seekp(0); - - if (msd.run.spectrumListPtr && config.preferOnlyMsLevel != 2) - { - // mzML <-> MGF - vendorMsd.run.spectrumListPtr = SpectrumListPtr(new SpectrumList_MGF_Filter(vendorMsd.run.spectrumListPtr)); - MSData msd_MGF; - Serializer_MGF serializer_MGF; - serializer_MGF.write(*stringstreamPtr, vendorMsd); - if (os_) *os_ << "MGF:\n" << stringstreamPtr->str() << endl; - serializer_MGF.read(serializedStreamPtr, msd_MGF); - - diffConfig_non_mzML.ignoreIdentity = true; - Diff diff_MGF(vendorMsd, msd_MGF, diffConfig_non_mzML); - if (diff_MGF && !os_) cerr << "MGF:\n" << headStream(*serializedStreamPtr, 5000) << endl; - if (diff_MGF) cerr << headDiff(diff_MGF, 5000) << endl; - unit_assert(!diff_MGF); - } } msds.clear(); diff --git a/pwiz_tools/MSConvertGUI/MainForm.Designer.cs b/pwiz_tools/MSConvertGUI/MainForm.Designer.cs index 7a2bfddc07..59727d9a98 100644 --- a/pwiz_tools/MSConvertGUI/MainForm.Designer.cs +++ b/pwiz_tools/MSConvertGUI/MainForm.Designer.cs @@ -1553,6 +1553,7 @@ private void InitializeComponent() "mzML", "mzXML", "mz5", + "mzMLb", "mgf", "text", "ms1", diff --git a/pwiz_tools/MSConvertGUI/MainForm.cs b/pwiz_tools/MSConvertGUI/MainForm.cs index a3ece2c26e..cf6c6ca973 100644 --- a/pwiz_tools/MSConvertGUI/MainForm.cs +++ b/pwiz_tools/MSConvertGUI/MainForm.cs @@ -568,7 +568,6 @@ private void AddFilterButton_Click(object sender, EventArgs e) String optimizationArgs = DemuxTypeBox.Text == "Overlap Only" ? " optimization=overlap_only" : String.Empty; demuxArgs += optimizationArgs; } - if (!String.IsNullOrEmpty(DemuxMassErrorValue.Text) && !String.IsNullOrEmpty(DemuxMassErrorTypeBox.Text)) { @@ -753,6 +752,9 @@ private String ConstructCommandline() case "mzXML": commandLine.Append("--mzXML|"); break; + case "mzMLb": + commandLine.Append("--mzMLb|"); + break; case "mz5": commandLine.Append("--mz5|"); break; @@ -851,6 +853,7 @@ private void SetControlsFromCommandline(string commandLine) OutputExtension = words[++i]; break; case "--mzXML": + case "--mzMLb": case "--mz5": case "--mgf": case "--text": diff --git a/pwiz_tools/MSConvertGUI/MainLogic.cs b/pwiz_tools/MSConvertGUI/MainLogic.cs index fe07862c4f..7233ef4cfa 100644 --- a/pwiz_tools/MSConvertGUI/MainLogic.cs +++ b/pwiz_tools/MSConvertGUI/MainLogic.cs @@ -132,6 +132,7 @@ public Config ParseCommandLine(string outputFolder, string argv) var formatText = false; var formatMzMl = false; + var formatMzMlB = false; var formatMzXml = false; var formatMz5 = false; var formatMgf = false; @@ -169,6 +170,9 @@ public Config ParseCommandLine(string outputFolder, string argv) case "--mzML": formatMzMl = true; break; + case "--mzMLb": + formatMzMlB = true; + break; case "--mzXML": formatMzXml = true; break; @@ -358,6 +362,7 @@ public Config ParseCommandLine(string outputFolder, string argv) + (formatMzMl ? 1 : 0) + (formatMzXml ? 1 : 0) + (formatMz5 ? 1 : 0) + + (formatMzMlB ? 1 : 0) + (formatMgf ? 1 : 0) + (formatMs1 ? 1 : 0) + (formatCms1 ? 1 : 0) @@ -368,6 +373,7 @@ public Config ParseCommandLine(string outputFolder, string argv) if (formatMzMl) config.WriteConfig.format = MSDataFile.Format.Format_mzML; if (formatMzXml) config.WriteConfig.format = MSDataFile.Format.Format_mzXML; if (formatMz5) config.WriteConfig.format = MSDataFile.Format.Format_MZ5; + if (formatMzMlB) config.WriteConfig.format = MSDataFile.Format.Format_mzMLb; if (formatMgf) config.WriteConfig.format = MSDataFile.Format.Format_MGF; if (formatMs1) config.WriteConfig.format = MSDataFile.Format.Format_MS1; if (formatCms1) config.WriteConfig.format = MSDataFile.Format.Format_CMS1; @@ -392,6 +398,9 @@ public Config ParseCommandLine(string outputFolder, string argv) case MSDataFile.Format.Format_MZ5: config.Extension = ".mz5"; break; + case MSDataFile.Format.Format_mzMLb: + config.Extension = ".mzMLb"; + break; case MSDataFile.Format.Format_MGF: config.Extension = ".mgf"; break; @@ -529,6 +538,7 @@ void processFile(string filename, Config config, ReaderList readers, Map$(PWIZ_RO "$(>)" --doc > "$(<)" } + exe msdir : msdir.cpp /ext/boost//program_options diff --git a/pwiz_tools/commandline/msconvert.cpp b/pwiz_tools/commandline/msconvert.cpp index 83cd56ce29..00b8cc75a3 100644 --- a/pwiz_tools/commandline/msconvert.cpp +++ b/pwiz_tools/commandline/msconvert.cpp @@ -108,6 +108,7 @@ string Config::outputFilename(const string& filename, const MSData& msd) const extension == ".cms1" || extension == ".ms2" || extension == ".cms2" || + extension == ".mzmlb" || extension == ".mz5") runId = bfs::basename(runId); } @@ -150,15 +151,15 @@ ostream& operator<<(ostream& os, const Config& config) static double string_to_double( const std::string& str ) { - errno = 0; - const char* stringToConvert = str.c_str(); - const char* endOfConversion = stringToConvert; - double value = STRTOD( stringToConvert, const_cast(&endOfConversion) ); - if( value == 0.0 && stringToConvert == endOfConversion ) // error: conversion could not be performed - throw bad_lexical_cast(); // not a double - if (*endOfConversion) - throw bad_lexical_cast(); // started out as a double but became something else - like "10foo.raw" - return value; + errno = 0; + const char* stringToConvert = str.c_str(); + const char* endOfConversion = stringToConvert; + double value = STRTOD( stringToConvert, const_cast(&endOfConversion) ); + if( value == 0.0 && stringToConvert == endOfConversion ) // error: conversion could not be performed + throw bad_lexical_cast(); // not a double + if (*endOfConversion) + throw bad_lexical_cast(); // started out as a double but became something else - like "10foo.raw" + return value; } void ShowExamples(ostringstream &usage) @@ -259,6 +260,9 @@ Config parseCommandLine(int argc, char** argv) bool format_CMS1 = false; bool format_MS2 = false; bool format_CMS2 = false; + bool format_mzMLb = false; + int mzMLb_compression_level = 0; + int mzMLb_chunk_size = 0; bool format_mz5 = false; bool precision_32 = false; bool precision_64 = false; @@ -266,19 +270,25 @@ Config parseCommandLine(int argc, char** argv) bool mz_precision_64 = false; bool intensity_precision_32 = false; bool intensity_precision_64 = false; + int mz_truncation = 0; + int intensity_truncation = 0; + bool mz_delta = false; + bool intensity_delta = false; + bool mz_linear = false; + bool intensity_linear = false; bool noindex = false; bool zlib = false; bool gzip = false; bool ms_numpress_all = false; // if true, use this numpress compression with default tolerance double ms_numpress_linear = -1; // if >= 0, use this numpress linear compression with this tolerance - string ms_numpress_linear_str; // input as text, to help with the "msconvert --numpresslinear foo.raw" case + string ms_numpress_linear_str; // input as text, to help with the "msconvert --numpresslinear foo.raw" case string ms_numpress_linear_default = (boost::format("%4.2g") % BinaryDataEncoder_default_numpressLinearErrorTolerance).str(); bool ms_numpress_pic = false; // if true, use this numpress Pic compression double ms_numpress_slof = -1; // if >= 0, use this numpress slof compression with this tolerance double ms_numpress_linear_abs_tolerance = -1; // if >= 0, use this numpress linear compression with this absolute Th tolerance - string ms_numpress_linear_abs_tolerance_str; // input as text - string ms_numpress_linear_abs_tolerance_str_default("-1"); // input as text - string ms_numpress_slof_str; // input as text, to help with the "msconvert --numpressslof foo.raw" case + string ms_numpress_linear_abs_tolerance_str; // input as text + string ms_numpress_linear_abs_tolerance_str_default("-1"); // input as text + string ms_numpress_slof_str; // input as text, to help with the "msconvert --numpressslof foo.raw" case string ms_numpress_slof_default = (boost::format("%4.2g") % BinaryDataEncoder_default_numpressSlofErrorTolerance).str(); string runIndexSet; bool detailedHelp = false; @@ -307,6 +317,9 @@ Config parseCommandLine(int argc, char** argv) ": set extension for output files [mzML|mzXML|mgf|txt" #ifndef WITHOUT_MZ5 "|mz5" +#endif +#ifndef WITHOUT_MZMLB + "|mzMLb" #endif "]") ("mzML", @@ -319,6 +332,17 @@ Config parseCommandLine(int argc, char** argv) ("mz5", po::value(&format_mz5)->zero_tokens(), ": write mz5 format") +#endif +#ifndef WITHOUT_MZMLB + ("mzMLb", + po::value(&format_mzMLb)->zero_tokens(), + ": write mzMLb format") + ("mzMLbChunkSize", + po::value(&mzMLb_chunk_size)->default_value(1048576), + ": mzMLb dataset chunk size in bytes") + ("mzMLbCompressionLevel", + po::value(&mzMLb_compression_level)->default_value(0), + ": mzMLb GZIP compression level (0-9)") #endif ("mgf", po::value(&format_MGF)->zero_tokens(), @@ -359,6 +383,24 @@ Config parseCommandLine(int argc, char** argv) ("inten32", po::value(&intensity_precision_32)->zero_tokens(), ": encode intensity values in 32-bit precision [default]") + ("mzTruncation", + po::value(&mz_truncation)->default_value(0), + ": Number of mantissa precision bits to truncate for mz and rt, or if -1 will store integers only") + ("intenTruncation", + po::value(&intensity_truncation)->default_value(0), + ": Number of mantissa precision bits to truncate for intensities, or if -1 will store integers only") + ("mzDelta", + po::value(&mz_delta)->zero_tokens(), + ": apply delta prediction to mz and rt values") + ("intenDelta", + po::value(&intensity_delta)->zero_tokens(), + ": apple delta prediction to intensity values") + ("mzLinear", + po::value(&mz_linear)->zero_tokens(), + ": apply linear prediction to mz and rt values") + ("intenLinear", + po::value(&intensity_linear)->zero_tokens(), + ": apply linear prediction to intensity values") ("noindex", po::value(&noindex)->zero_tokens(), ": do not write index") @@ -581,37 +623,37 @@ Config parseCommandLine(int argc, char** argv) // check stuff - if (ms_numpress_slof_str.length()) // was that a numerical arg to --numpressSlof, or a filename? - { - try - { - ms_numpress_slof = string_to_double(ms_numpress_slof_str); - } - catch(...) { - config.filenames.push_back(ms_numpress_slof_str); // actually that was a filename + if (ms_numpress_slof_str.length()) // was that a numerical arg to --numpressSlof, or a filename? + { + try + { + ms_numpress_slof = string_to_double(ms_numpress_slof_str); + } + catch(...) { + config.filenames.push_back(ms_numpress_slof_str); // actually that was a filename ms_numpress_slof = BinaryDataEncoder_default_numpressSlofErrorTolerance; - } - } - if (ms_numpress_linear_abs_tolerance_str.length()) // this argument needs to be numerical - { - ms_numpress_linear_abs_tolerance = string_to_double(ms_numpress_linear_abs_tolerance_str); - } - if (ms_numpress_linear_str.length()) // was that a numerical arg to --numpressLinear, or a filename? - { - try - { - ms_numpress_linear = string_to_double(ms_numpress_linear_str); - } - catch(...) { - config.filenames.push_back(ms_numpress_linear_str); // actually that was a filename + } + } + if (ms_numpress_linear_abs_tolerance_str.length()) // this argument needs to be numerical + { + ms_numpress_linear_abs_tolerance = string_to_double(ms_numpress_linear_abs_tolerance_str); + } + if (ms_numpress_linear_str.length()) // was that a numerical arg to --numpressLinear, or a filename? + { + try + { + ms_numpress_linear = string_to_double(ms_numpress_linear_str); + } + catch(...) { + config.filenames.push_back(ms_numpress_linear_str); // actually that was a filename ms_numpress_linear = BinaryDataEncoder_default_numpressLinearErrorTolerance; - } - } + } + } if (config.filenames.empty()) throw user_error("[msconvert] No files specified."); - int count = format_text + format_mzML + format_mzXML + format_MGF + format_MS2 + format_CMS2 + format_mz5; + int count = format_text + format_mzML + format_mzXML + format_MGF + format_MS2 + format_CMS2 + format_mz5 + format_mzMLb; if (count > 1) throw user_error("[msconvert] Multiple format flags specified."); if (format_text) config.writeConfig.format = MSDataFile::Format_Text; if (format_mzML) config.writeConfig.format = MSDataFile::Format_mzML; @@ -622,6 +664,8 @@ Config parseCommandLine(int argc, char** argv) if (format_MS2) config.writeConfig.format = MSDataFile::Format_MS2; if (format_CMS2) config.writeConfig.format = MSDataFile::Format_CMS2; if (format_mz5) config.writeConfig.format = MSDataFile::Format_MZ5; + if (format_mzMLb) config.writeConfig.format = MSDataFile::Format_mzMLb; + config.writeConfig.gzipped = gzip; // if true, file is written as .gz @@ -653,6 +697,12 @@ Config parseCommandLine(int argc, char** argv) case MSDataFile::Format_CMS2: config.extension = ".cms2"; break; + case MSDataFile::Format_mzMLb: +#ifdef WITHOUT_MZMLB + throw user_error("[msconvert] Not built with mzMLb support."); +#endif + config.extension = ".mzMLb"; + break; case MSDataFile::Format_MZ5: #ifdef WITHOUT_MZ5 throw user_error("[msconvert] Not built with mz5 support."); @@ -668,6 +718,23 @@ Config parseCommandLine(int argc, char** argv) } } + // handle prediction flags + if (intensity_delta) + config.writeConfig.binaryDataEncoderConfig.predictionOverrides[MS_intensity_array] = BinaryDataEncoder::Prediction_Delta; + if (mz_delta) + { + config.writeConfig.binaryDataEncoderConfig.predictionOverrides[MS_m_z_array] = BinaryDataEncoder::Prediction_Delta; + config.writeConfig.binaryDataEncoderConfig.predictionOverrides[MS_time_array] = BinaryDataEncoder::Prediction_Delta; + } + if (intensity_linear) + config.writeConfig.binaryDataEncoderConfig.predictionOverrides[MS_intensity_array] = BinaryDataEncoder::Prediction_Linear; + if (mz_linear) + { + config.writeConfig.binaryDataEncoderConfig.predictionOverrides[MS_m_z_array] = BinaryDataEncoder::Prediction_Linear; + config.writeConfig.binaryDataEncoderConfig.predictionOverrides[MS_time_array] = BinaryDataEncoder::Prediction_Linear; + } + + // precision defaults config.writeConfig.binaryDataEncoderConfig.precision = BinaryDataEncoder::Precision_64; @@ -705,13 +772,26 @@ Config parseCommandLine(int argc, char** argv) if (intensity_precision_64) config.writeConfig.binaryDataEncoderConfig.precisionOverrides[MS_intensity_array] = BinaryDataEncoder::Precision_64; + if (mz_truncation != 0) + config.writeConfig.binaryDataEncoderConfig.truncationOverrides[MS_m_z_array] = mz_truncation; + if (intensity_truncation != 0) + config.writeConfig.binaryDataEncoderConfig.truncationOverrides[MS_intensity_array] = intensity_truncation; + // other flags if (noindex) config.writeConfig.indexed = false; - if (zlib) + if (zlib || mzMLb_compression_level > 0) + { config.writeConfig.binaryDataEncoderConfig.compression = BinaryDataEncoder::Compression_Zlib; + if (mzMLb_compression_level == 0) + config.writeConfig.mzMLb_compression_level = 4; + else + config.writeConfig.mzMLb_compression_level = mzMLb_compression_level; + } + + config.writeConfig.mzMLb_chunk_size = mzMLb_chunk_size; if ((ms_numpress_slof>=0) && ms_numpress_pic) throw user_error("[msconvert] Incompatible compression flags 'numpressPic' and 'numpressSlof'."); diff --git a/pwiz_tools/examples/Jamfile.jam b/pwiz_tools/examples/Jamfile.jam index 70748bdd46..0a11669281 100644 --- a/pwiz_tools/examples/Jamfile.jam +++ b/pwiz_tools/examples/Jamfile.jam @@ -58,6 +58,14 @@ exe mscat ; +exe mzmlbcat + : mzmlbcat.cpp + /ext/boost//program_options + /ext/boost//filesystem + : ../../.. ../commandline//mstool-requirements + ; + + exe txt2mzml : txt2mzml.cpp ../../pwiz/data/msdata//pwiz_data_msdata @@ -131,6 +139,7 @@ install bin hello_analyzer hello_analyzer_2 mscat + mzmlbcat msbenchmark txt2mzml write_example_files diff --git a/pwiz_tools/examples/mzmlbcat.cpp b/pwiz_tools/examples/mzmlbcat.cpp new file mode 100644 index 0000000000..e101d96135 --- /dev/null +++ b/pwiz_tools/examples/mzmlbcat.cpp @@ -0,0 +1,344 @@ +// +// $Id: mzmlbcat.cpp +// +// +// Original authors: Andrew Dowsey +// +// Copyright 2017 biospi Laboratory, +// University of Bristol, UK +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// + + +#include "pwiz/Version.hpp" +#include "pwiz/data/msdata/Version.hpp" +#include "pwiz/analysis/Version.hpp" +#include "boost/program_options.hpp" +#include "boost/format.hpp" +#include "pwiz/utility/misc/Filesystem.hpp" +#include "pwiz/utility/misc/Std.hpp" +#include +#ifndef WITHOUT_MZMLB +#include "pwiz/data/msdata/mzmlb/Connection_mzMLb.hpp" +#endif + + +using namespace pwiz::util; +using namespace pwiz::msdata::mzmlb; + + +/// Holds the results of the parseCommandLine function. +struct Config +{ + vector filenames; + bool metadata; + int spectrum; + int chromatogram; + + Config() + : metadata(false), spectrum(-1), chromatogram(-1) + { + } +}; + + +/// Parses command line arguments to setup execution parameters, and +/// contructs help text. Inputs are the arguments to main. A filled +/// Config object is returned. +Config parseCommandLine(int argc, char** argv) +{ + namespace po = boost::program_options; + + ostringstream usage; + usage << "Usage: mzmlbcat [options] [filemasks]\n" + << "Cat mzML contained within mzMLb file(s).\n" + << "\n" + << "Return value: # of failed files.\n" + << "\n"; + + Config config; + string filelistFilename; + string configFilename; + + bool detailedHelp = false; + bool metadata = false; + int spectrum = -1; + int chromatogram = -1; + + pair consoleBounds = get_console_bounds(); // get platform-specific console bounds, or default values if an error occurs + + po::options_description od_config("Options", consoleBounds.first); + od_config.add_options() + ("metadata", + po::value(&metadata)->zero_tokens(), + ": output only the metadata (no spectra or chromatograms)") + ("spectrum", + po::value(&spectrum)->default_value(-1), + ": output a spectrum") + ("chromatogram", + po::value(&chromatogram)->default_value(-1), + ": output a chromatogram") + ("help", + po::value(&detailedHelp)->zero_tokens(), + ": show this message") + ; + + // handle positional arguments + + const char* label_args = "args"; + + po::options_description od_args; + od_args.add_options()(label_args, po::value< vector >(), ""); + + po::positional_options_description pod_args; + pod_args.add(label_args, -1); + + po::options_description od_parse; + od_parse.add(od_config).add(od_args); + + // parse command line + + po::variables_map vm; + po::store(po::command_line_parser(argc, (char**)argv). + options(od_parse).positional(pod_args).run(), vm); + po::notify(vm); + + // append options description to usage string + + usage << od_config + << endl + << "Examples:\n" + << endl + << "# output metadata in data.mzMLb\n" + << "mzmlbcat --metadata data.mzMLb\n" + << endl + << endl + + << "Questions, comments, and bug reports:\n" + << "http://www.biospi.org\n" + << "andrew.dowsey@bristol.ac.uk\n" + << "\n" + << "ProteoWizard release: " << pwiz::Version::str() << " (" << pwiz::Version::LastModified() << ")" << endl + << "ProteoWizard MSData: " << pwiz::msdata::Version::str() << " (" << pwiz::msdata::Version::LastModified() << ")" << endl + << "ProteoWizard Analysis: " << pwiz::analysis::Version::str() << " (" << pwiz::analysis::Version::LastModified() << ")" << endl + << "Build date: " << __DATE__ << " " << __TIME__ << endl; + + if ((argc <= 1) || detailedHelp) + throw usage_exception(usage.str().c_str()); + + // remember filenames from command line + + if (vm.count(label_args)) + { + config.filenames = vm[label_args].as< vector >(); + + // expand the filenames by globbing to handle wildcards + vector globbedFilenames; + BOOST_FOREACH(const string& filename, config.filenames) + if (expand_pathmask(bfs::path(filename), globbedFilenames) == 0) + cout << "[msconvert] no files found matching \"" << filename << "\"" << endl; + + config.filenames.clear(); + BOOST_FOREACH(const bfs::path& filename, globbedFilenames) + config.filenames.push_back(filename.string()); + } + + // parse filelist if required + + if (!filelistFilename.empty()) + { + ifstream is(filelistFilename.c_str()); + while (is) + { + string filename; + getline(is, filename); + if (is) config.filenames.push_back(filename); + } + } + + // check stuff + config.metadata = metadata; + config.spectrum = spectrum; + config.chromatogram = chromatogram; + + return config; +} + + +int go(const Config& config) +{ + using boost::iostreams::stream_offset; + using std::streamsize; + int failedFileCount = 0; + + for (vector::const_iterator it = config.filenames.begin(); + it != config.filenames.end(); ++it) + { + try + { +#ifndef WITHOUT_MZMLB + boost::iostreams::stream* is = new boost::iostreams::stream(Connection_mzMLb(*it)); + if (!is->get() || !*is) + throw runtime_error(("[go::read] Unable to open file " + *it).c_str()); + is->unget(); + + char buf[4097]; + if (config.metadata) + { + std::vector spectrumPositions((*is)->size("mzML_spectrumIndex")); + (*is)->read("mzML_spectrumIndex", &spectrumPositions[0], spectrumPositions.size()); + std::vector chromatogramPositions((*is)->size("mzML_chromatogramIndex")); + (*is)->read("mzML_chromatogramIndex", &chromatogramPositions[0], chromatogramPositions.size()); + + { // before spectra + stream_offset n = spectrumPositions.front(); + while (n > 0) + { + is->read(buf, n < 4096 ? n : 4096); + n -= is->gcount(); + buf[is->gcount()] = 0; + std::cout << buf; + } + } + { // between spectra and chromatograms + is->seekg(spectrumPositions.back(), std::ios_base::beg); + stream_offset n = chromatogramPositions.front() - spectrumPositions.back(); + while (n > 0) + { + is->read(buf, n < 4096 ? n : 4096); + n -= is->gcount(); + buf[is->gcount()] = 0; + std::cout << buf; + } + } + { // after chromatograms to end + is->seekg(chromatogramPositions.back(), std::ios_base::beg); + while (true) + { + is->read(buf, 4096); + if (is->gcount() <= 0) break; + buf[is->gcount()] = 0; + std::cout << buf; + } + } + } + else if (config.spectrum >= 0) + { + std::vector spectrumPositions((*is)->size("mzML_spectrumIndex")); + (*is)->read("mzML_spectrumIndex", &spectrumPositions[0], spectrumPositions.size()); + if (config.spectrum >= spectrumPositions.size() - 1) break; + + // output spectrum + is->seekg(spectrumPositions[config.spectrum], std::ios_base::beg); + stream_offset n = spectrumPositions[config.spectrum + 1] - spectrumPositions[config.spectrum]; + while (n > 0) + { + is->read(buf, n < 4096 ? n : 4096); + n -= is->gcount(); + buf[is->gcount()] = 0; + std::cout << buf; + } + } + else if (config.chromatogram >= 0) + { + std::vector chromatogramPositions((*is)->size("mzML_chromatogramIndex")); + (*is)->read("mzML_chromatogramIndex", &chromatogramPositions[0], chromatogramPositions.size()); + if (config.chromatogram >= chromatogramPositions.size() - 1) break; + + // output chromatogram + is->seekg(chromatogramPositions[config.chromatogram], std::ios_base::beg); + stream_offset n = chromatogramPositions[config.chromatogram + 1] - chromatogramPositions[config.chromatogram]; + while (n > 0) + { + is->read(buf, n < 4096 ? n : 4096); + n -= is->gcount(); + buf[is->gcount()] = 0; + std::cout << buf; + } + } + else + { // dump whole mzML + while (true) + { + is->read(buf, 4096); + if (is->gcount() <= 0) break; + buf[is->gcount()] = 0; + std::cout << buf; + } + } + + delete is; +#endif + } + catch (exception& e) + { + failedFileCount++; + cerr << e.what() << endl; + cerr << "Error processing file " << *it << "\n\n"; + } + } + + return failedFileCount; +} + + +int main(int argc, char* argv[]) +{ + bnw::args args(argc, argv); + + std::locale global_loc = std::locale(); + std::locale loc(global_loc, new bfs::detail::utf8_codecvt_facet); + bfs::path::imbue(loc); + + try + { + Config config = parseCommandLine(argc, argv); + + return go(config); + } + catch (usage_exception& e) + { + cerr << e.what() << endl; + return 0; + } + catch (user_error& e) + { + cerr << e.what() << endl; + return 1; + } + catch (boost::program_options::error& e) + { + cerr << "Invalid command-line: " << e.what() << endl; + return 1; + } + catch (exception& e) + { + cerr << e.what() << endl; + } + catch (...) + { + cerr << "[" << argv[0] << "] Caught unknown exception.\n"; + } + + cerr << "Please report this error to andrew.dowsey@bristol.ac.uk.\n" + << "Attach the command output and this version information in your report:\n" + << "\n" + << "ProteoWizard release: " << pwiz::Version::str() << " (" << pwiz::Version::LastModified() << ")" << endl + << "ProteoWizard MSData: " << pwiz::msdata::Version::str() << " (" << pwiz::msdata::Version::LastModified() << ")" << endl + << "ProteoWizard Analysis: " << pwiz::analysis::Version::str() << " (" << pwiz::analysis::Version::LastModified() << ")" << endl + << "Build date: " << __DATE__ << " " << __TIME__ << endl; + + return 1; +} +