Matrix Science Mascot Parser toolkit
 
Loading...
Searching...
No Matches
ms_mascotresfile_msr.hpp
1/*
2##############################################################################
3# file: ms_mascotresfile_msr.hpp #
4# 'msparser' toolkit #
5# Encapsulates a mascot results file #
6##############################################################################
7# COPYRIGHT NOTICE #
8# Copyright 1998-2023 Matrix Science Limited All Rights Reserved. #
9# #
10##############################################################################
11# $Archive:: /MowseBranches/ms_mascotresfile_1.2/include/ms_mascotresfi $ #
12# $Author: villek@matrixscience.com $ #
13# $Date: 2024-09-23 15:56:03 +0100 $ #
14# $Revision: 09156baa48968c2f3ca3f6899bf06cc9caec40b7 | MSPARSER_REL_3_0_0-2024-09-24-0-g93ebaeb4f4 $ #
15# $NoKeywords:: $ #
16##############################################################################
17*/
18
19#ifndef MS_MASCOTRESFILE_MSR_HPP
20#define MS_MASCOTRESFILE_MSR_HPP
21
22
23// Includes from the standard template library
24#include <stdarg.h>
25#include <time.h>
26#include <string>
27#include <list>
28#include <vector>
29#include <set>
30#include <unordered_map>
31#include <array>
32#include <thread>
33#include <atomic>
34
// Internal declarations for the MSR results-file class declared later in this
// header: forward declarations of the types ms_mascotresfile_msr befriends or
// refers to, and the ResfileCaches struct it holds as its caches_ member.
// NOTE(review): std::map, std::shared_ptr and int64_t are used in this header,
// but <map>, <memory> and <cstdint> are not in the include list above, and
// matrix_science::ms_peptide is used before any declaration visible here -
// presumably an earlier include provides them; confirm.
35namespace msparser_internal {
36 class ms_peptidesumsql;
37 class ms_inputquery_msr;
38 class ms_sqlpp11_connection;
39
 // Everything up to the matching #endif is skipped when SWIG is defined.
40#ifndef SWIG
41 namespace msr_cached_data
42 {
 // Only forward-declared in this header; used below as the element type of
 // QueryDataCache.
43 struct QueryData;
 // Groups the cache containers below. Every data member is
 // `thread_local static`, so there is one instance of each container per
 // thread rather than one per ResfileCaches object.
44 struct ResfileCaches
45 {
 // Length of the fixed-size std::array stored per key in
 // SummaryQueryInfoCache and QueryDataCache.
46 static constexpr int CACHE_SIZE = 10;
47
 // Values held per query and per PSM type in summaryQueryInfoCache_.
 // The commented-out fields record that observedMr, qMatch and plughole
 // live in summaryQueryInfoL2Cache_ instead (see SummaryQueryInfoHotData).
48 struct SummaryQueryInfoData
49 {
50 //double observedMr; // Stored in summaryQueryInfoL2Cache_
51 double mZ;
52 double intensity;
53 int charge;
54 // int64_t qMatch; // Stored in summaryQueryInfoL2Cache_;
55 //double plughole; // Stored in summaryQueryInfoL2Cache_;
56 };
57
 // Mapped value layout (an array of CACHE_SIZE pairs):
58 // [(queryId, (QueryDataStandard, QueryDataDecoy, ...)), ...]
 // The inner array has one SummaryQueryInfoData per ms_peptide PSM type.
 // NOTE(review): the meaning of the outer int key is not stated here -
 // presumably it identifies the results file (cf. ms_mascotresfile_msr::id_);
 // confirm against the .cpp.
59 using SummaryQueryInfoCache = std::map<int, std::array<std::pair<int, std::array<SummaryQueryInfoData, matrix_science::ms_peptide::N_PSM_TYPES>>, CACHE_SIZE>>;
60 thread_local static SummaryQueryInfoCache summaryQueryInfoCache_;
61
62 // Store data that is usually not accessed sequentially in a separate long term cache.
63 // e.g. ms_protein::sortPeptides accesses mr in a non-sequential order.
64 struct SummaryQueryInfoHotData
65 {
66 double observedMr;
67 double plughole;
68 int64_t qMatch;
69 };
70
71 // Map queryId -> (SummaryQueryInfoHotData)
 // The mapped value is an array with one SummaryQueryInfoHotData per
 // ms_peptide PSM type; unlike SummaryQueryInfoCache it is an
 // unordered_map, not a CACHE_SIZE-bounded array.
 // NOTE(review): as above, the outer int key is presumably a results-file
 // identifier - confirm.
72 using SummaryQueryInfoL2Cache = std::map< int, std::unordered_map<int, std::array<SummaryQueryInfoHotData, matrix_science::ms_peptide::N_PSM_TYPES>> >;
73 thread_local static SummaryQueryInfoL2Cache summaryQueryInfoL2Cache_;
74
 // CACHE_SIZE QueryData entries per int key (key meaning not shown here).
75 using QueryDataCache = std::map< int, std::array<QueryData, CACHE_SIZE> >;
76 thread_local static QueryDataCache queryCache_;
77 };
78 }
79#endif
80}
81
// Public namespace of the toolkit. The forward declarations below name types
// that appear only as pointer or reference parameters in this header.
82namespace matrix_science {
83 class ms_umod_configfile;
84 class ms_masses;
85 class ms_enzymefile;
86 class ms_taxonomyfile;
87 class ms_mascotresults_params;
88
95
 // Class for parsing and reading files in MSR format.
 // Derives publicly from ms_mascotresfilebase; every member marked `override`
 // below replaces a virtual function of that base. The two
 // msparser_internal classes named as friends get access to the protected
 // and private members.
 // NOTE(review): getSLExecCommand, getHeaderValue, getSearchParameter and
 // getMassValue spell both `virtual` and `override`; the other overriders use
 // `override` alone. The extra `virtual` is redundant.
104 class MS_MASCOTRESFILE_API ms_mascotresfile_msr : public ms_mascotresfilebase
105 {
106 friend class msparser_internal::ms_peptidesumsql;
107 friend class msparser_internal::ms_inputquery_msr;
108
109 public:
110
 // Constructor. Only szFileName is required (presumably the path of the
 // MSR file - confirm); every other argument takes the default written
 // below. keepAliveText's default contains a %d placeholder and
 // cacheDirectory's default contains %Y and %m placeholders.
112 ms_mascotresfile_msr(const char * szFileName,
113 const int keepAliveInterval = 0,
114 const char * keepAliveText = "<!-- %d seconds -->\n",
115 const unsigned int flags = RESFILE_NOFLAG,
116 const char * cacheDirectory = "../data/cache/%Y/%m",
117 const char * XMLschemaDirectory = 0,
118 ms_progress_info * progressMonitor = 0);
119
120 virtual ~ms_mascotresfile_msr();
121
122 // ------------------- Basic generic functions -------------------
123
124
126 int appendResfile(const char * filename,
127 int flags=RESFILE_USE_PARENT_PARAMS,
128 const char * cacheDirectory = 0) override; // returns 'id' of added file
129
131 const ms_mascotresfile_msr* getResfile(int id) const override;
132
134 int getNumberOfResfiles() const override;
135
137 int getJobNumber(const int resfileID = 1) const override;
138
139 // ----------------- Specific results functions ------------------
 // Note the differing defaults in this group: getNumQueries and the
 // getNum* functions taking `idx` default to 0, while the Fasta /
 // spectral-library (SL) accessors default to 1.
 // NOTE(review): what 0 selects is not visible in this header - confirm.
141 int getNumQueries(const int resfileID = 0) const override;
142
144 int getNumSeqs(const int idx = 0) const override;
145
147 int getNumSeqsAfterTax(const int idx = 0) const override;
148
150 int getNumEtSeqsSearched(const int idx = 0) const override;
151
153 int getNumLibraryEntries(const int idx = 0) const override;
154
156 double getNumResidues(const int idx = 0) const override;
157
159 bool isDatabaseTypeAvailable() const override;
161 DATABASE_TYPE getDatabaseType(const int idx) const override;
163 int getReferenceDatabaseNumberOfSL(const int idx) const override;
164
166 std::vector<int> getSLDatabaseNumbersOfReference(const int idx) const override;
167
169 int getExecTime() const override;
170
172 int getDate() const override;
173
175 std::string getMascotVer() const override;
176
178 std::string getFastaVer(int idx = 1) const override;
179
181 std::string getFastaPath(int idx = 1) const override;
182
184 std::string getUniqueTaskID() const override;
185
187 ms_mascotoptions::DECOY_ALGORITHM getDecoyTypeForDB(const int idx = 1) const override;
188
190 double getSLFragmentTolerance(int idx = 1) const override;
191
193 std::string getSLFragmentToleranceUnit(int idx = 1) const override;
194
196 virtual std::string getSLExecCommand(int idx = 1) const override;
197
199 bool isPMF() const override;
200
202 bool isMSMS() const override;
203
205 bool isSQ() const override;
206
208 bool isErrorTolerant() const override;
209
211 bool anyPMF() const override;
212
214 bool anyMSMS() const override;
215
217 bool anySQ() const override;
218
220 bool anyTag() const override;
221
223 bool anyFastaMatches(const bool isDecoy=false) const override;
224
226 bool anySpectralLibraryMatches(const bool isDecoy=false) const override;
227
229 bool anyErrorTolerantMatches(const bool isDecoy=false) const override;
230
232 bool anyCrosslinkedMatches(const bool isDecoy=false) const override;
233
235 double getObservedMass(const int query) const override;
236
238 int getObservedCharge(const int query, const bool decoy=false) const override;
239
241 double getObservedMrValue(const int query, const bool decoy=false) const override;
242
244 double getObservedIntensity(const int query) const override;
245
247 std::string getRepeatSearchString(const int query, const bool fullQuery = false) const override;
248
250 std::string getFileName(const int id = 1) const override;
251
 // The get* functions below take a pointer to a caller-supplied
 // configuration object and return bool.
 // NOTE(review): presumably they fill the object and report success -
 // confirm against the implementation.
253 bool getQuantitation(ms_quant_configfile *qfile) const override;
254
256 bool getUnimod(ms_umod_configfile *ufile, bool useSchemaFromResfile = false) const override;
257
259 bool getUnimodXL(ms_umod_configfile *ufile, bool useSchemaFromResfile = false) const override;
260
262 bool getEnzyme(ms_enzymefile *efile, const char * enzymeFileName = 0) const override;
263
265 bool getTaxonomy(ms_taxonomyfile *tfile) const override;
266
268 std::string getCacheDirectory(bool processed = true) const override;
269
271 std::string getCacheFileName() const override;
272
273
 // Same function in both branches; the SWIG branch names the two int
 // reference parameters OUTPUT so that SWIG treats them as additional
 // return values.
274#ifndef SWIG
276 bool getSrcQueryAndFileIdForMultiFile(const int q, int & gsqNewQuery, int & gsqFileId) const override;
277#else // SWIG Multiple return values
278 bool getSrcQueryAndFileIdForMultiFile(const int q, int & OUTPUT, int & OUTPUT) const override;
279#endif
281 int getMultiFileQueryNumber(const int localQuery, const int fileId) const override;
282
284 bool hasEnzyme() const override;
285
287 bool hasRT() const override;
288
290 bool hasQuantitation() const override;
291
293 ms_inputquery getInputQuery(const int queryNum) const override;
294
 // Key/value getters that return their results through two reference
 // parameters. As above, the SWIG branch declares the same functions with
 // the reference parameters named OUTPUT.
295#ifndef SWIG
297 void getHeaderKeyValues(std::vector<std::string> & keys, std::vector<std::string> & values) const override;
298
300 void getSearchParametersKeyValues(std::vector<std::string> & keys, std::vector<std::string> & values) const override;
301
303 void getMassesKeyValues(std::vector<std::string> & keys, std::vector<double> & values) const override;
304#else
305 void getHeaderKeyValues(std::vector<std::string> & OUTPUT, std::vector<std::string> & OUTPUT) const override;
306
307 void getSearchParametersKeyValues(std::vector<std::string> & OUTPUT, std::vector<std::string> & OUTPUT) const override;
308
309 void getMassesKeyValues(std::vector<std::string> & OUTPUT, std::vector<double> & OUTPUT) const override;
310#endif
311
 // Single-value lookups by key, matching the three *KeyValues getters.
313 virtual std::string getHeaderValue(const std::string& key) const override;
314
316 virtual std::string getSearchParameter(const std::string& key) const override;
317
319 virtual double getMassValue(const std::string& key) const override;
320
322 int getLibraryMods(std::vector<std::string> & modNames, std::vector<double> & modDeltas) const override;
323
325 int64_t getQmatch(const int query, const ms_peptide::PSM_TYPE pepType) const override;
326
328 double getQplughole(const int query, const ms_peptide::PSM_TYPE pepType) const override;
329
331 double getFirstPassThreshold() const override;
332
 // Connection accessors, hidden from SWIG. sqlite3_connection() returns
 // the db_ reference member and sqlite3_connection_ptr() returns a copy
 // of the msDbPtr_ shared pointer; both are inline. The third function is
 // only declared here.
333#ifndef SWIG
334 msparser_internal::ms_sqlpp11_connection & sqlite3_connection() const { return db_; }
335 std::shared_ptr<msparser_internal::ms_sqlpp11_connection> sqlite3_connection_ptr() const { return msDbPtr_; }
336 msparser_internal::ms_sqlpp11_connection & getSrcQueryAndSqlHandleForMultiFile(const int q, int & gsqNewQuery) const;
337#endif
338
339 protected:
340#ifndef SWIG
341 // Not safe to copy or assign this object.
 // The copy assignment operator is declared protected so code outside
 // the class hierarchy cannot call it; no definition is visible in this
 // header.
343 ms_mascotresfile_msr & operator=(const ms_mascotresfile_msr & rhs);
344#endif
345
346 bool getCrosslinking(ms_crosslinking_configfile *crosslinkingFile) const override;
347
348 private:
349
 // Private overriders of ms_mascotresfilebase virtual functions.
350 void validateResfileVersion() override;
351
352 void propagateKeepAlive(const int keepAliveInterval,
353 const char * keepAliveText,
354 const bool propagateToAppended,
355 const bool resetStartTime) override;
356
 // getPepSumCacheFilename and willCreatePepSumCache each come in a form
 // taking the individual settings and a form taking one
 // ms_mascotresults_params object.
357 std::string getPepSumCacheFilename(
358 const unsigned int flags,
359 double minProbability,
360 int maxHitsToReport,
361 const char * unigeneIndexFile,
362 double ignoreIonsScoreBelow,
363 int minPepLenInPepSummary,
364 const char * singleHit,
365 const unsigned int flags2) const override;
366
367 std::string getPepSumCacheFilename(const ms_mascotresults_params & parameters) const override;
368
369 int getNumHitsInProteinSummary(const ms_peptide::PSM_TYPE psmType) const override;
370
371 bool willCreatePepSumCache(const unsigned int flags,
372 double minProbability,
373 int maxHitsToReport,
374 const char * unigeneIndexFile,
375 double ignoreIonsScoreBelow,
376 int minPepLenInPepSummary,
377 const char * singleHit,
378 const unsigned int flags2) const override;
379
380 bool willCreatePepSumCache(const ms_mascotresults_params &parameters) const override;
381
 // This overload additionally takes the options object and has two
 // non-const reference parameters (cache file name and cache status).
382 bool willCreatePepSumCache(const ms_mascotresults_params & parameters,
383 const ms_mascotoptions & opts,
384 std::string & peptideSummaryCacheFileName,
385 unsigned int & cacheStatus) const override;
386
 // build*Impl overriders: the summary builders take shared_ptr
 // references to internal base types; the query / search-parameter
 // builders return a shared_ptr to an internal base type.
387 void buildPeptideSummaryImpl(ms_peptidesummary & pepSum,
388 const ms_mascotresults_params & parameters,
389 std::shared_ptr<msparser_internal::ms_mascotresultsbase> & results,
390 std::shared_ptr<msparser_internal::ms_peptidesummarybase> & iPepSum) const override;
391
392 void buildProteinSummaryImpl(ms_proteinsummary & protSum,
393 const ms_mascotresults_params & parameters,
394 std::shared_ptr<msparser_internal::ms_mascotresultsbase> & results,
395 std::shared_ptr<msparser_internal::ms_proteinsummarybase> & iProtSum) const override;
396
397 std::shared_ptr<msparser_internal::ms_inputquerybase> buildInputQueryImpl(const int q) const override;
398
399 std::shared_ptr<msparser_internal::ms_searchparamsbase> buildSearchParamsImpl() const override;
400
401
402 // Cumulative count of queries
403 // e.g.: file1= 10, file2= 5, file3= 4 => [0, 10, 15, 19]
404 std::vector<int> cumNumQueries_;
405 // std::list because it doesn't require its elements to be copy constructible
406 std::list<ms_mascotresfile_msr> appendedFiles_;
407 // Pointers to this and all the appended files
408 std::vector<ms_mascotresfile_msr*> allFilesPtr_;
409
 // Shared handle to the connection, and a reference member to a
 // connection object. Because db_ is a reference it must be bound in the
 // constructor's initializer list.
 // NOTE(review): db_ presumably refers to *msDbPtr_ - confirm.
410 std::shared_ptr<msparser_internal::ms_sqlpp11_connection> msDbPtr_;
411 msparser_internal::ms_sqlpp11_connection & db_;
412 std::unordered_map<std::string, std::string> mapHeader_;
413
 // Per-database metadata record. The default constructor
 // value-initializes the strings and decoyType (empty / 0) and sets the
 // four count fields to -1.
414 struct DbMetadata
415 {
416 DbMetadata()
417 : fastaFile{},
418 release{},
419 dbType{},
420 decoyType{},
421 nSequences(-1),
422 nSequencesAfterTax(-1),
423 nResidues(-1),
424 etSequences(-1)
425 {};
426
427 std::string fastaFile,
428 release,
429 dbType;
430 int decoyType,
431 nSequences,
432 nSequencesAfterTax;
433 int64_t nResidues;
434 int etSequences;
435 };
436
 // DbMetadata records keyed by int, plus a `valid` flag that starts
 // false. Held as a mutable member so const member functions can modify it.
 // NOTE(review): presumably filled by _cacheDbData() and keyed by
 // database index - confirm.
437 struct DbDataCache
438 {
439 DbDataCache()
440 : valid(false)
441 {}
442
443 bool valid;
444 std::map<int, DbMetadata> data;
445 };
446 mutable DbDataCache dbDataCache_;
447
448 void _cacheQueryInfoData(const int q) const;
449 void _cacheDbData() const;
450
451 // -1: Not set, 0: False, 1: True
452 mutable char anyPMFCache_;
453 mutable char anyMSMSCache_;
454 mutable char anySQCache_;
455 mutable char anyTagCache_;
456
 // One char per ms_peptide PSM type.
 // NOTE(review): presumably uses the same -1/0/1 encoding as the flags
 // above - confirm.
457 mutable std::array<char, ms_peptide::N_PSM_TYPES> hasAnyPepSumMatchesCache_;
458 bool _hasAnyPeptideSummaryMatches(const ms_peptide::PSM_TYPE type) const;
459
 // Const helpers taking a query number and returning int; the meaning of
 // the return value is not visible in this header.
460 int _cacheQueryData(const int q) const;
461 int _cacheQuerySeq(const int q) const;
462 int _cacheQueryComp(const int q) const;
463 int _cacheQuerySeqTag(const int q) const;
464
465 std::string processedCacheDir_;
466 std::string fileBaseName_; // File name without path
467
468 // Keep track of the number of ms_mascotresfile_msr objects created in the process
469 static std::atomic<int> resfileCounter_;
 // NOTE(review): presumably assigned from resfileCounter_ at
 // construction - confirm in the .cpp.
470 int id_;
471
 // ResfileCaches (msparser_internal::msr_cached_data, earlier in this
 // header) declares only thread_local static data members, so this
 // member carries no per-object data.
472 msparser_internal::msr_cached_data::ResfileCaches caches_;
473
474 }; // end of resfile_group
476} // matrix_science namespace
477#endif // MS_MASCOTRESFILE_MSR_HPP
478
479/*------------------------------- End of File -------------------------------*/
This class represents the file crosslinking.xml.
Definition: ms_crosslinking_configfile.hpp:49
Reads and parses the enzymes file that contains multiple enzyme definitions.
Definition: ms_enzyme.hpp:194
This class encapsulates the input queries (peak lists) in the Mascot results file.
Definition: ms_inputquery.hpp:43
An instance of this class represents all the parameters specified in the Options section of mascot.dat.
Definition: ms_mascotoptions.hpp:91
DECOY_ALGORITHM
Definitions for how the decoy sequences are generated.
Definition: ms_mascotoptions.hpp:156
Class for parsing and reading files in MSR format.
Definition: ms_mascotresfile_msr.hpp:105
Abstract base class of ms_mascotresfile_dat and ms_mascotresfile_msr.
Definition: ms_mascotresfilebase.hpp:72
Class which provides constructor parameters for either ms_peptidesummary or ms_proteinsummary.
Definition: ms_mascotresults_params.hpp:32
PSM_TYPE
Specifies the search pass and origin of the peptide match.
Definition: ms_peptide.hpp:107
Use this class to get peptide summary results.
Definition: ms_peptidesummary.hpp:51
Contains information of the current progress of a task being performed.
Definition: ms_progress_info.hpp:40
Definition: ms_proteinsummary.hpp:45
Use this class in order to read/write quantitation.xml.
Definition: ms_quant_configfile.hpp:52
Use this class in order to read in a taxonomy file.
Definition: ms_taxonomyfile.hpp:145
This class represents the file unimod.xml.
Definition: ms_umod_configfile.hpp:54
DATABASE_TYPE
Definition: ms_databaseoptions.hpp:39