Matrix Science Mascot Parser toolkit
 
Loading...
Searching...
No Matches
ms_protein.hpp
1/*
2##############################################################################
3# file: ms_mascotresprotein.hpp #
4# 'msparser' toolkit #
5# Encapsulates a protein - either for protein summary or peptide summary #
6##############################################################################
7# COPYRIGHT NOTICE #
8# Copyright 1998-2018 Matrix Science Limited All Rights Reserved. #
9# #
10##############################################################################
11# $Archive:: /Mowse/ms_mascotresfile/include/ms_mascotresprotein.hpp $ #
12# $Author: robertog@matrixscience.com $ #
13# $Date: 2023-08-29 15:05:00 +0100 $ #
14# $Revision: 59a8ab98151c86e0bdeddc2efea60d287d6087ba | MSPARSER_REL_3_0_0-2024-09-24-0-g93ebaeb4f4 $ #
15# $NoKeywords:: $ #
16##############################################################################
17*/
18
19#ifndef MS_MASCOTRESPROTEIN_HPP
20#define MS_MASCOTRESPROTEIN_HPP
21
22
23// Includes from the standard template library
24#include <string>
25#include <list>
26#include <vector>
27#include <set>
28#include <map>
29#include <ostream>
30
31namespace msparser_internal {
32 class ms_protein_match_data;
33 class PEPINFO;
34 class ms_mascotresultsbase;
35 class ms_peptidesumsql;
36}
37
38namespace matrix_science {
39 class ms_mascotresults;
40 class ms_proteinsummary;
41 class ms_pepinfoSortByScore;
42 class ms_peptide;
43
50
56 class MS_MASCOTRESFILE_API ms_protein
57 {
58 public:
60
73 enum GROUP
74 {
79 GROUP_FAMILY
80 };
81
82#ifdef DUPLICATE
83 #ifdef _WIN32
84// #pragma message("WARNING: The identifier 'DUPLICATE' was defined but is incompatible with the definition for ms_protein")
85 #endif
86 #undef DUPLICATE
87#endif
88
90
98 {
103 DUPE_Ignored
104 };
105
107
117 {
118 MASS_NON_SELECT_NON_MATCH = 0x0001,
119 MASS_SELECT_NON_MATCH = 0x0010,
120 MASS_NON_SELECT_MATCH = 0x0100,
121 MASS_SELECT_MATCH = 0x1000
122 };
123
125
187 {
188 DPF_SEQUENCE = 0x0001,
189 DPF_CHARGE = 0x0002,
190 DPF_MODS = 0x0004,
191 DPF_UNIQUE = 0x0008,
192 DPF_NODUPSAMEQUERY = 0x0010
193 };
194
195 // Types for uniquely identifying a protein
196 typedef std::pair<int, std::string> dbIdxPlusAcc_t;
197 typedef std::vector<dbIdxPlusAcc_t> dbIdxPlusAccVect_t;
198 typedef std::set<dbIdxPlusAcc_t> dbIdxPlusAccSet_t;
199
200
202 ms_protein(const double score,
203 const std::string accession,
204 const bool updateScoreFromPepScores,
205 const int proteinSummaryHit = 0);
206
208 ms_protein(const ms_protein& src);
209
211 ~ms_protein();
212
213#ifndef SWIG
215 ms_protein& operator=(const ms_protein& right);
216#endif
217
219 void copyFrom(const ms_protein* src);
220
222 int64_t getProteinId() const;
223
225 void setProteinId(int64_t proteinId);
226
228 std::string getAccession() const;
229
231 int getDB() const;
232
234 void setDB(int dbIdx);
235
237 double getScore() const;
238
240 double getNonMudpitScore() const;
241
243 double getScoreWithET() const;
244
246 int getNumPeptides() const;
247
249 int getNumDisplayPeptides(bool aboveThreshold = false) const;
250
252 GROUP getGrouping() const;
253
254#ifndef DOXYGEN_SHOULD_SKIP_THIS
256 void setGrouping(GROUP g) { group_ = g; }
257
259 std::string getForCache(dbIdxPlusAccVect_t & supersetProteinsUnsorted,
260 dbIdxPlusAccVect_t & components) const;
261
263 bool setFromCache(const std::string & str, msparser_internal::ms_mascotresultsbase & results,
264 const dbIdxPlusAccVect_t & supersetProteinsUnsorted,
265 const dbIdxPlusAccVect_t & components,
266 const std::string & cdbFeatures);
267
269 std::vector<std::pair<int, int> > getIgnoredQPs() const;
270
272 bool isIgnoredQP(const int q, const int p) const;
273#endif
274
276 void getIgnoredQPs(std::vector<int> &q, std::vector<int> &p) const;
277
279 int getPeptideQuery (const int pepNumber) const;
280
282 int getPeptideP (const int pepNumber) const;
283
285 int getPepNumber(const int q, const int p) const;
286
288 int getPeptideFrame (const int pepNumber, const ms_peptide::PSM psmComponent = ms_peptide::PSM_COMPLETE) const;
289
291 long getPeptideStart (const int pepNumber, const ms_peptide::PSM psmComponent = ms_peptide::PSM_COMPLETE) const;
292
294 long getPeptideEnd (const int pepNumber, const ms_peptide::PSM psmComponent = ms_peptide::PSM_COMPLETE) const;
295
297 long getPeptideMultiplicity (const int pepNumber, const ms_peptide::PSM psmComponent = ms_peptide::PSM_COMPLETE) const;
298
300 DUPLICATE getPeptideDuplicate (const int pepNumber, const bool allowErrTolDuplicate = true) const;
301
303 double getPeptideIonsScore (const int pepNumber) const;
304
306 bool getPeptideIsBold (const int pepNumber) const;
307
309 void setPeptideIsBold (const int pepNumber);
310
312 bool getPeptideShowCheckbox (const int pepNumber) const;
313
315 void setPeptideShowCheckbox (const int pepNumber);
316
318 int getPeptideComponentID (const int pepNumber) const;
319
321 char getPeptideResidueBefore (const int pepNumber, const ms_peptide::PSM psmComponent = ms_peptide::PSM_COMPLETE) const;
322
324 char getPeptideResidueAfter (const int pepNumber, const ms_peptide::PSM psmComponent = ms_peptide::PSM_COMPLETE) const;
325
327 bool isASimilarProtein(const ms_protein * prot,
328 const ms_mascotresults * results,
329 const bool groupByQueryNumber = false);
330
332 std::string getSimilarProteinName() const;
333
335 int getSimilarProteinDB() const;
336
338 bool isSimilarProtein(const std::string & acc, const int dbIdx) const;
339
341 int getSimilarProteins(std::vector<std::string> & accessions, std::vector<int> & dbIdxs) const;
342
344 void setSimilarProtein(const ms_protein * prot);
345
348 void addOnePeptide( ms_mascotresults & results,
349 const int q, const int p,
350 const msparser_internal::ms_protein_match_data &proteinMatchData,
351 const double correctedScore,
352 const double uncorrectedScore,
353 const ms_protein * component,
354 const ms_peptide::SEARCH_PHASE searchPhase,
355 const bool isIgnored);
356
357
359 long getCoverage() const;
360
362 bool anyMatchToQuery(const int query) const;
363
365 bool anyMatchToQueryAndP(const int query, const int P) const;
366
368 std::string getUnmatchedMasses(ms_mascotresfilebase & resfile,
369 const int numDecimalPlaces = 2) const;
370
372 std::string getMasses(ms_mascotresfilebase & resfile,
373 const ms_proteinsummary & summary,
374 const unsigned int flags = MASS_SELECT_MATCH,
375 const int numDecimalPlaces = 2) const;
376
378 int getFrame() const;
379
381 bool anyBoldRedPeptides(const ms_mascotresults & results) const;
382
384 bool isUnigene() const;
385
387 void setIsUnigeneEntry();
388
390 bool isPMFMixture() const;
391
393 void setIsPMFMixture();
394
396 bool isUpdateScoreFromPepScores() const;
397
399 void sortPeptides(const ms_mascotresults & results, bool keepAlive = false, int keepAlivePercent = 0, const char * keepAliveAccession = "", int keepAliveCount = 0);
400
402 int getNumComponents() const;
403
405 const ms_protein * getComponent(const int componentNumber) const;
406
408 int getProteinSummaryHit() const;
409
411 double getRMSDeltas(const ms_mascotresults & results) const;
412
414 int getHitNumber() const;
415
422 void setHitNumber(const int hit) { hitNum_ = hit;}
423
425 int getMemberNumber() const;
426
428 int getLongestPeptideLen() const;
429
431 int getNumDistinctPeptides(bool aboveThreshold = false,
432 DISTINCT_PEPTIDE_FLAGS flags = DPF_SEQUENCE) const;
433
435 int getNumDistinctPeptideRepeats(
436 int distinctIndex, // 1..getNumDistinctPeptides
437 bool aboveThreshold = false,
438 DISTINCT_PEPTIDE_FLAGS flags = DPF_SEQUENCE) const;
439
441 ms_peptide getDistinctPeptide(
442 int distinctIndex, // 1..getNumDistinctPeptides
443 int repeatIndex = 1, // 1..getNumDistinctPeptideRepeats
444 bool aboveThreshold = false,
445 DISTINCT_PEPTIDE_FLAGS flags = DPF_SEQUENCE) const;
446
448 int getLongestSigPeptideLen() const;
449
451 int getNumObservedForEmPAI() const;
452#ifndef SWIG
454
463 friend inline bool operator<(const ms_protein & lhs, const ms_protein & rhs) {
464 if (lhs.dbIdx_ == rhs.dbIdx_) {
465 if ( lhs.proteinSummaryHit_ == 0 ) {
466 return lhs.accession_ < rhs.accession_;
467 } else { // i.e ms_proteinsummary - see parser bug 493
468 if ( lhs.accession_ == rhs.accession_) {
469 return lhs.getFrame() < rhs.getFrame();
470 } else {
471 return lhs.accession_ < rhs.accession_;
472 }
473 }
474 } else {
475 return lhs.dbIdx_ < rhs.dbIdx_;
476 }
477 }
478#endif
479 // Undocumented function for fast access
480 const char * getAccessionStr() const { return accession_.c_str(); }
481
482 const std::vector<std::pair<int,int64_t>>& getSupersetProteinIds() const { return supersetProteinIdsUnsorted_; }
483 const std::vector<std::pair<int,int64_t>>& getComponentProteinIds() const { return componentProteinIds_; }
484
485 std::string stringify() const;
486
487 private:
488
489 // Values for flags_
490 // These ones are not cached in the cdb file (obviously!)
// Bit masks stored in flags_ (tested with isFlagSet(), changed with setFlag()).
// Every mask must be a distinct single bit so the states can be told apart.
//
// These ones are not cached in the cdb file (obviously!)
static constexpr unsigned char FL_LOADED_BASE_INFO_FROM_CACHE  = 0x01; // False if not using cdb, otherwise true once setFromCache() has been called
static constexpr unsigned char FL_LOADED_QP_FROM_CACHE         = 0x02; // True if using cdb and just q/p values loaded.
// This mask previously had the value 0x02, identical to
// FL_LOADED_QP_FROM_CACHE, which made "only q/p loaded" and "everything
// loaded" indistinguishable in flags_. It now uses the otherwise unused
// bit 0x04.
// NOTE(review): the original comment on this line read "Same as FL_SORTED";
// confirm what was meant, and rebuild every translation unit that uses
// these masks.
static constexpr unsigned char FL_LOADED_ALL_FROM_CACHE        = 0x04;
// These ones are cached in the cdb file
static constexpr unsigned char FL_SORTED                       = 0x10; // Sorting the list of peptides is expensive - don't repeat...
static constexpr unsigned char FL_UNIGENE                      = 0x20; // For unigene, we need to get the description line from the unigene file
static constexpr unsigned char FL_UPDATE_SCORE_FROM_PEP_SCORES = 0x40; // For protein summary, the protein score is calculated by
                                                                       // nph-mascot.exe, and is in the results file. For the
                                                                       // peptide summary, the score is calculated by adding the ions
                                                                       // scores

static constexpr unsigned char FL_PMF_MIXTURE                  = 0x80; // True if protein actually originates from a PMF mixture
503
504 void initialiseDistinctPeptideTree(
505 bool aboveThreshold,
506 DISTINCT_PEPTIDE_FLAGS flags) const;
507
508 ms_errs* getErrorHandler() const;
509
510 // --- Start of uncached variables
511 mutable std::vector<msparser_internal::PEPINFO *> peptides_; // sort by query
512 mutable std::vector<msparser_internal::PEPINFO *> ignoredPeptides_;
513 mutable std::vector<msparser_internal::PEPINFO> allPeptides_;
514
515 msparser_internal::ms_mascotresultsbase * results_;
516// bool loadedFromCache_;
517 // --- End of uncached variables
518
519 // Start of all cached variables
520 unsigned char flags_; // See FL_... not all bits are cached
521
522 int numPeptides_; // Only used if loadedFromCache_ is true - otherwise peptides_.size();
523 mutable int numDisplayPeptides_;
524 mutable int numDisplayPeptidesAboveThresh_;
525 mutable int numDistinctPeptides_;
526 mutable int numDistinctPeptidesAboveThresh_;
527 mutable int numDistinctUniquePeptides_;
528 mutable int numDistinctUniqPepAboveThresh_;
529 mutable int lenLongestPeptideAboveThresh_;
530 mutable int numObservedForEmPAI_;
531 mutable int frame_;
532 mutable bool distinctPeptideAboveThreshold_;
533 mutable DISTINCT_PEPTIDE_FLAGS distinctPeptideFlags_;
534 mutable std::list<std::list<ms_peptide*> > distinctPeptideTree_;
535 dbIdxPlusAccSet_t supersetProteins_; // This one is filled when loading from cache
536 dbIdxPlusAccVect_t supersetProteinsUnsorted_;
537 // Same as supersetProteinsUnsorted_ but using protein ids
538 std::vector<std::pair<int,int64_t>> supersetProteinIdsUnsorted_;
539
540 // For unigene and PMF mixture, the protein is really a 'pseudo'
541 // protein, made up from a number of 'real' proteins
542 dbIdxPlusAccVect_t components_;
543 // Same as components_ but using protein ids
544 std::vector<std::pair<int,int64_t>> componentProteinIds_;
545
546 int64_t proteinId_;
547 std::string accession_;
548 int dbIdx_;
549 double score_;
550 double nonMudPITScore_;
551 double scoreWithET_;
552 GROUP group_;
553 int proteinSummaryHit_;
554 int hitNum_;
555 mutable int memberNum_;
556 int longestPeptideLen_; // Useful with minPepLenInPepSummary
557 mutable long coverage_;
558// bool pmfMixture_; // True if protein actually originates from a PMF mixture
559// bool sorted_; // Sorting the list of peptides is expensive - don't repeat...
560// bool unigene_; // For unigene, we need to get the description line from the unigene file
561// bool updateScoreFromPepScores_; // For protein summary, the protein score is calculated by
562 // nph-mascot.exe, and is in the results file. For the
563 // peptide summary, the score is calculated by adding the ions
564 // scores
565
566 // Functions
567 void copyPeptidePointers(std::vector<msparser_internal::PEPINFO *> &pointersTo, const std::vector<msparser_internal::PEPINFO *> &pointersFrom, const ms_protein *src);
568 void checkFromCache(const char * calledBy) const;
569 void checkQPFromCache(const char * calledBy) const;
570 bool isFlagSet(unsigned char fl) const { return (flags_ & fl)?true:false; }
571 void setFlag(unsigned char fl, bool val) {
572 if (val) {
573 flags_ |= fl;
574 } else {
575 //the standard says that complement must promote to an int... there is no performance difference though
576 flags_ = static_cast<unsigned char>(flags_ & ~fl);
577 }
578 }
579
580 static bool isVarModStrEmpty(const std::string &str);
581
582 friend class prot_sort;
583 friend class ms_pepinfoSortByScore;
584 friend class msparser_internal::ms_peptidesumsql;
585 };
586#ifndef SWIG
587 // Helper class - don't use from outside library
588 class ms_proteinPtrSortByAccession
589 {
590 public:
591 bool operator() (const ms_protein * p1, const ms_protein * p2) const {
592 return (*p1 < *p2);
593 }
594 };
595
596 class ms_proteinPtrSortByScore
597 {
598 public:
599 bool operator() (const ms_protein * p1, const ms_protein * p2) const {
600 if (p1->getScore() != p2->getScore()) {
601 return (p1->getScore() > p2->getScore());
602 } else {
603 return (*p1 < *p2);
604 }
605 }
606 };
607
608
609 class ms_pepinfoSortByScore
610 {
611 public:
612 ms_pepinfoSortByScore(std::pair<bool, bool> pairParam): removeDiffPos_(pairParam.first), anyLibraryMatches_(pairParam.second) { }
613 bool operator() (const msparser_internal::PEPINFO * p1, const msparser_internal::PEPINFO * p2) const;
614 ms_pepinfoSortByScore(const ms_pepinfoSortByScore& other): removeDiffPos_(other.removeDiffPos_), anyLibraryMatches_(other.anyLibraryMatches_){}
615 ms_pepinfoSortByScore& operator=(const ms_pepinfoSortByScore& other)
616 {
617 if (&other != this) {
618 removeDiffPos_ = other.removeDiffPos_;
619 anyLibraryMatches_ = other.anyLibraryMatches_;
620 }
621 return *this;
622 }
623
624 private:
625 bool removeDiffPos_;
626 bool anyLibraryMatches_;
627 };
628
629 inline std::ostream& operator << (std::ostream& out, const ms_protein& prot)
630 {
631 out << prot.stringify();
632 return out;
633 }
634
635#endif // end of resfile_group
637} // matrix_science namespace
638
639
640#endif // MS_MASCOTRESPROTEIN_HPP
641
642/*------------------------------- End of File -------------------------------*/
643
644
645
646
Abstract base class of ms_mascotresfile_dat and ms_mascotresfile_msr.
Definition: ms_mascotresfilebase.hpp:72
Abstract class for either ms_peptidesummary or ms_proteinsummary.
Definition: ms_mascotresults.hpp:83
PSM
Type of data to return from accessor methods.
Definition: ms_peptide.hpp:98
This class encapsulates a protein in the mascot results file.
Definition: ms_protein.hpp:57
DISTINCT_PEPTIDE_FLAGS
Enum for getNumDistinctPeptides().
Definition: ms_protein.hpp:187
GROUP
Enum to say if a protein is similar to another higher scoring protein.
Definition: ms_protein.hpp:74
@ GROUP_NO
Does not contain the same set (or a subset) of peptides as any other protein. A 'lead' protein.
Definition: ms_protein.hpp:76
@ GROUP_UNKNOWN
No information about grouping.
Definition: ms_protein.hpp:75
@ GROUP_COMPLETE
Contains an identical set of peptides to one or more other proteins.
Definition: ms_protein.hpp:78
@ GROUP_SUBSET
Contains a subset of peptides in one or more other proteins.
Definition: ms_protein.hpp:77
ms_protein(const double score, const std::string accession, const bool updateScoreFromPepScores, const int proteinSummaryHit=0)
Constructors - used from ms_proteinsummary and ms_peptidesummary.
MASS_FLAGS
enum for each protein to specify what masses to select.
Definition: ms_protein.hpp:117
int getFrame() const
Returns the frame number for the protein.
Definition: ms_protein.cpp:1403
friend bool operator<(const ms_protein &lhs, const ms_protein &rhs)
Protein objects perform a simple sort of themselves by database ID and then accession.
Definition: ms_protein.hpp:463
DUPLICATE
Enum for each peptide in the protein to indicate if it is a duplicate.
Definition: ms_protein.hpp:98
@ DUPE_DuplicateSameQuery
Another match for the same query with the same peptide string got a higher score (different mods).
Definition: ms_protein.hpp:101
@ DUPE_Duplicate
Another peptide from a different query with the same sequence as this got a higher score.
Definition: ms_protein.hpp:100
@ DUPE_NotDuplicate
There are no other peptides with the same sequence in this protein - from this query or other queries...
Definition: ms_protein.hpp:99
@ DUPE_HighestScoringDuplicate
There is at least one other peptide the same as this with a lower score.
Definition: ms_protein.hpp:102
Definition: ms_proteinsummary.hpp:45