Matrix Science Mascot Parser toolkit
 
Loading...
Searching...
No Matches
ms_protein.hpp
1/*
2##############################################################################
3# file: ms_mascotresprotein.hpp #
4# 'msparser' toolkit #
5# Encapsulates a protein - either for protein summary or peptide summary #
6##############################################################################
7# COPYRIGHT NOTICE #
8# Copyright 1998-2018 Matrix Science Limited All Rights Reserved. #
9# #
10##############################################################################
11# $Archive:: /Mowse/ms_mascotresfile/include/ms_mascotresprotein.hpp $ #
12# $Author: robertog@matrixscience.com $ #
13# $Date: 2024-10-18 17:17:30 +0100 $ #
14# $Revision: acbe3031f44525093eaa8eaf0cdcc49609a339f0 | MSPARSER_REL_3_1_0-2025-07-27-0-gea47708fac $ #
15# $NoKeywords:: $ #
16##############################################################################
17*/
18
19#ifndef MS_MASCOTRESPROTEIN_HPP
20#define MS_MASCOTRESPROTEIN_HPP
21
22
23// Includes from the standard template library
24#include <string>
25#include <list>
26#include <vector>
27#include <set>
28#include <map>
29#include <ostream>
30
31namespace msparser_internal {
32 class ms_protein_match_data;
33 class PEPINFO;
34 class ms_mascotresultsbase;
35 class ms_peptidesumsql;
36}
37
38namespace matrix_science {
39 class ms_mascotresults;
40 class ms_proteinsummary;
41 class ms_pepinfoSortByScore;
42 class ms_peptide;
43
50
56 class MS_MASCOTRESFILE_API ms_protein
57 {
58 public:
60
73 enum GROUP
74 {
79 GROUP_FAMILY
80 };
81
82#ifdef DUPLICATE
83 #ifdef _WIN32
84// #pragma message("WARNING: The identifier 'DUPLICATE' was defined but is incompatible with the definition for ms_protein")
85 #endif
86 #undef DUPLICATE
87#endif
88
90
98 {
103 DUPE_Ignored
104 };
105
107
117 {
118 MASS_NON_SELECT_NON_MATCH = 0x0001,
119 MASS_SELECT_NON_MATCH = 0x0010,
120 MASS_NON_SELECT_MATCH = 0x0100,
121 MASS_SELECT_MATCH = 0x1000
122 };
123
125
187 {
188 DPF_SEQUENCE = 0x0001,
189 DPF_CHARGE = 0x0002,
190 DPF_MODS = 0x0004,
191 DPF_UNIQUE = 0x0008,
192 DPF_NODUPSAMEQUERY = 0x0010
193 };
194
195 // Types for uniquely identifying a protein
196 typedef std::pair<int, std::string> dbIdxPlusAcc_t;
197 typedef std::vector<dbIdxPlusAcc_t> dbIdxPlusAccVect_t;
198 typedef std::set<dbIdxPlusAcc_t> dbIdxPlusAccSet_t;
199
201 ms_protein(const double score,
202 const std::string accession,
203 const bool updateScoreFromPepScores,
204 const int proteinSummaryHit = 0);
205
207 ms_protein(const ms_protein& src);
208
210 ~ms_protein();
211
212#ifndef SWIG
214 ms_protein(ms_protein&& src) noexcept;
215
217 ms_protein& operator=(ms_protein&& right) noexcept;
218
219 void swap(ms_protein& src) noexcept;
220
222 ms_protein& operator=(const ms_protein& right);
223#endif
224
226 void copyFrom(const ms_protein* src);
227
229 int64_t getProteinId() const;
230
232 void setProteinId(int64_t proteinId);
233
235 std::string getAccession() const;
236
238 int getDB() const;
239
241 void setDB(int dbIdx);
242
244 double getScore() const;
245
247 double getNonMudpitScore() const;
248
250 double getScoreWithET() const;
251
253 int getNumPeptides() const;
254
256 int getNumDisplayPeptides(bool aboveThreshold = false) const;
257
259 GROUP getGrouping() const;
260
261#ifndef DOXYGEN_SHOULD_SKIP_THIS
263 void setGrouping(GROUP g) { group_ = g; }
264
266 std::string getForCache(dbIdxPlusAccVect_t & supersetProteinsUnsorted,
267 dbIdxPlusAccVect_t & components) const;
268
270 bool setFromCache(const std::string & str, msparser_internal::ms_mascotresultsbase & results,
271 const dbIdxPlusAccVect_t & supersetProteinsUnsorted,
272 const dbIdxPlusAccVect_t & components,
273 const std::string & cdbFeatures);
274
276 std::vector<std::pair<int, int> > getIgnoredQPs() const;
277
279 bool isIgnoredQP(const int q, const int p) const;
280#endif
281
283 void getIgnoredQPs(std::vector<int> &q, std::vector<int> &p) const;
284
286 int getPeptideQuery (const int pepNumber) const;
287
289 int getPeptideP (const int pepNumber) const;
290
292 int getPepNumber(const int q, const int p) const;
293
295 int getPeptideFrame (const int pepNumber, const ms_peptide::PSM psmComponent = ms_peptide::PSM_COMPLETE) const;
296
298 long getPeptideStart (const int pepNumber, const ms_peptide::PSM psmComponent = ms_peptide::PSM_COMPLETE) const;
299
301 long getPeptideEnd (const int pepNumber, const ms_peptide::PSM psmComponent = ms_peptide::PSM_COMPLETE) const;
302
304 long getPeptideMultiplicity (const int pepNumber, const ms_peptide::PSM psmComponent = ms_peptide::PSM_COMPLETE) const;
305
307 DUPLICATE getPeptideDuplicate (const int pepNumber, const bool allowErrTolDuplicate = true) const;
308
310 double getPeptideIonsScore (const int pepNumber) const;
311
313 bool getPeptideIsBold (const int pepNumber) const;
314
316 void setPeptideIsBold (const int pepNumber);
317
319 bool getPeptideShowCheckbox (const int pepNumber) const;
320
322 void setPeptideShowCheckbox (const int pepNumber);
323
325 int getPeptideComponentID (const int pepNumber) const;
326
328 char getPeptideResidueBefore (const int pepNumber, const ms_peptide::PSM psmComponent = ms_peptide::PSM_COMPLETE) const;
329
331 char getPeptideResidueAfter (const int pepNumber, const ms_peptide::PSM psmComponent = ms_peptide::PSM_COMPLETE) const;
332
334 bool isASimilarProtein(const ms_protein * prot,
335 const ms_mascotresults * results,
336 const bool groupByQueryNumber = false);
337
339 std::string getSimilarProteinName() const;
340
342 int getSimilarProteinDB() const;
343
345 bool isSimilarProtein(const std::string & acc, const int dbIdx) const;
346
348 int getSimilarProteins(std::vector<std::string> & accessions, std::vector<int> & dbIdxs) const;
349
351 void setSimilarProtein(const ms_protein * prot);
352
355 void addOnePeptide( ms_mascotresults & results,
356 const int q, const int p,
357 const msparser_internal::ms_protein_match_data &proteinMatchData,
358 const double correctedScore,
359 const double uncorrectedScore,
360 const ms_protein * component,
361 const ms_peptide::SEARCH_PHASE searchPhase,
362 const bool isIgnored);
363
364
366 long getCoverage() const;
367
369 bool anyMatchToQuery(const int query) const;
370
372 bool anyMatchToQueryAndP(const int query, const int P) const;
373
375 std::string getUnmatchedMasses(ms_mascotresfilebase & resfile,
376 const int numDecimalPlaces = 2) const;
377
379 std::string getMasses(ms_mascotresfilebase & resfile,
380 const ms_proteinsummary & summary,
381 const unsigned int flags = MASS_SELECT_MATCH,
382 const int numDecimalPlaces = 2) const;
383
385 int getFrame() const;
386
388 bool anyBoldRedPeptides(const ms_mascotresults & results) const;
389
391 bool isUnigene() const;
392
394 void setIsUnigeneEntry();
395
397 bool isPMFMixture() const;
398
400 void setIsPMFMixture();
401
403 bool isUpdateScoreFromPepScores() const;
404
406 void sortPeptides(const ms_mascotresults & results, bool keepAlive = false, int keepAlivePercent = 0, const char * keepAliveAccession = "", int keepAliveCount = 0);
407
409 int getNumComponents() const;
410
412 const ms_protein * getComponent(const int componentNumber) const;
413
415 int getProteinSummaryHit() const;
416
418 double getRMSDeltas(const ms_mascotresults & results) const;
419
421 int getHitNumber() const;
422
429 void setHitNumber(const int hit) { hitNum_ = hit;}
430
432 int getMemberNumber() const;
433
435 int getLongestPeptideLen() const;
436
438 int getNumDistinctPeptides(bool aboveThreshold = false,
439 DISTINCT_PEPTIDE_FLAGS flags = DPF_SEQUENCE) const;
440
442 int getNumDistinctPeptideRepeats(
443 int distinctIndex, // 1..getNumDistinctPeptides
444 bool aboveThreshold = false,
445 DISTINCT_PEPTIDE_FLAGS flags = DPF_SEQUENCE) const;
446
448 ms_peptide getDistinctPeptide(
449 int distinctIndex, // 1..getNumDistinctPeptides
450 int repeatIndex = 1, // 1..getNumDistinctPeptideRepeats
451 bool aboveThreshold = false,
452 DISTINCT_PEPTIDE_FLAGS flags = DPF_SEQUENCE) const;
453
455 int getLongestSigPeptideLen() const;
456
458 int getNumObservedForEmPAI() const;
459#ifndef SWIG
461
470 friend inline bool operator<(const ms_protein & lhs, const ms_protein & rhs) {
471 if (lhs.dbIdx_ == rhs.dbIdx_) {
472 if ( lhs.proteinSummaryHit_ == 0 ) {
473 return lhs.accession_ < rhs.accession_;
474 } else { // i.e ms_proteinsummary - see parser bug 493
475 if ( lhs.accession_ == rhs.accession_) {
476 return lhs.getFrame() < rhs.getFrame();
477 } else {
478 return lhs.accession_ < rhs.accession_;
479 }
480 }
481 } else {
482 return lhs.dbIdx_ < rhs.dbIdx_;
483 }
484 }
485#endif
486 // Undocumented function for fast access
487 const char * getAccessionStr() const { return accession_.c_str(); }
488
489 const std::vector<std::pair<int,int64_t>>& getSupersetProteinIds() const { return supersetProteinIdsUnsorted_; }
490 const std::vector<std::pair<int,int64_t>>& getComponentProteinIds() const { return componentProteinIds_; }
491
492 std::string stringify() const;
493
494 private:
495
496 // Values for flags_
// Each constant is a bit value for the unsigned char member flags_.
497 // These ones are not cached in the cdb file (obviously!)
498 static constexpr unsigned char FL_LOADED_BASE_INFO_FROM_CACHE = 0x01; // False if not using cdb, otherwise true once setFromCache() has been called
499 static constexpr unsigned char FL_LOADED_QP_FROM_CACHE = 0x02; // True if using cdb and just q/p values loaded.
// NOTE(review): FL_LOADED_ALL_FROM_CACHE has the same value (0x02) as
// FL_LOADED_QP_FROM_CACHE, so the two cannot be distinguished in flags_, and
// the "Same as FL_SORTED" remark does not match FL_SORTED (0x10) below.
// Confirm against the implementation whether this aliasing is intended.
500 static constexpr unsigned char FL_LOADED_ALL_FROM_CACHE = 0x02; // Same as FL_SORTED
501 // These ones are cached in the cdb file
502 static constexpr unsigned char FL_SORTED = 0x10; // Sorting the list of peptides is expensive - don't repeat...
503 static constexpr unsigned char FL_UNIGENE = 0x20; // For unigene, we need to get the description line from the unigene file
504 static constexpr unsigned char FL_UPDATE_SCORE_FROM_PEP_SCORES = 0x40; // For protein summary, the protein score is calculated by
505 // nph-mascot.exe, and is in the results file. For the
506 // peptide summary, the score is calculated by adding the ions
507 // scores
508
509 static constexpr unsigned char FL_PMF_MIXTURE = 0x80; // True if protein actually originates from a PMF mixture
510
511 void initialiseDistinctPeptideTree(
512 bool aboveThreshold,
513 DISTINCT_PEPTIDE_FLAGS flags) const;
514
515 ms_errs* getErrorHandler() const;
516
517 // --- Start of uncached variables
518 mutable std::vector<msparser_internal::PEPINFO *> peptides_; // sort by query
519 mutable std::vector<msparser_internal::PEPINFO *> ignoredPeptides_;
520 mutable std::vector<msparser_internal::PEPINFO> allPeptides_;
521
522 msparser_internal::ms_mascotresultsbase * results_;
523// bool loadedFromCache_;
524 // --- End of uncached variables
525
526 // Start of all cached variables
527 unsigned char flags_; // See FL_... not all bits are cached
528
529 int numPeptides_; // Only used if loadedFromCache_ is true - otherwise peptides_.size();
530 mutable int numDisplayPeptides_;
531 mutable int numDisplayPeptidesAboveThresh_;
532 mutable int numDistinctPeptides_;
533 mutable int numDistinctPeptidesAboveThresh_;
534 mutable int numDistinctUniquePeptides_;
535 mutable int numDistinctUniqPepAboveThresh_;
536 mutable int lenLongestPeptideAboveThresh_;
537 mutable int numObservedForEmPAI_;
538 mutable int frame_;
539 mutable bool distinctPeptideAboveThreshold_;
540 mutable DISTINCT_PEPTIDE_FLAGS distinctPeptideFlags_;
541 mutable std::list<std::list<ms_peptide*> > distinctPeptideTree_;
542 dbIdxPlusAccSet_t supersetProteins_; // This one is filled when loading from cache
543 dbIdxPlusAccVect_t supersetProteinsUnsorted_;
544 // Same as supersetProteinsUnsorted_ but using protein ids
545 std::vector<std::pair<int,int64_t>> supersetProteinIdsUnsorted_;
546
547 // For unigene and PMF mixture, the protein is really a 'pseudo'
548 // protein, made up from a number of 'real' proteins
549 dbIdxPlusAccVect_t components_;
550 // Same as components_ but using protein ids
551 std::vector<std::pair<int,int64_t>> componentProteinIds_;
552
553 int64_t proteinId_;
554 std::string accession_;
555 int dbIdx_;
556 double score_;
557 double nonMudPITScore_;
558 double scoreWithET_;
559 GROUP group_;
560 int proteinSummaryHit_;
561 int hitNum_;
562 mutable int memberNum_;
563 int longestPeptideLen_; // Useful with minPepLenInPepSummary
564 mutable long coverage_;
565// bool pmfMixture_; // True if protein actually originates from a PMF mixture
566// bool sorted_; // Sorting the list of peptides is expensive - don't repeat...
567// bool unigene_; // For unigene, we need to get the description line from the unigene file
568// bool updateScoreFromPepScores_; // For protein summary, the protein score is calculated by
569 // nph-mascot.exe, and is in the results file. For the
570 // peptide summary, the score is calculated by adding the ions
571 // scores
572
573 // Functions
574 void copyPeptidePointers(std::vector<msparser_internal::PEPINFO *> &pointersTo, const std::vector<msparser_internal::PEPINFO *> &pointersFrom, const ms_protein *src);
575 void checkFromCache(const char * calledBy) const;
576 void checkQPFromCache(const char * calledBy) const;
577 bool isFlagSet(unsigned char fl) const { return (flags_ & fl)?true:false; }
578 void setFlag(unsigned char fl, bool val) {
579 if (val) {
580 flags_ |= fl;
581 } else {
582 //the standard says that complement must promote to an int... there is no performance difference though
583 flags_ = static_cast<unsigned char>(flags_ & ~fl);
584 }
585 }
586
587 static bool isVarModStrEmpty(const std::string &str);
588
589 friend class prot_sort;
590 friend class ms_pepinfoSortByScore;
591 friend class msparser_internal::ms_peptidesumsql;
592 };
593#ifndef SWIG
594 // Helper class - don't use from outside library
595 class ms_proteinPtrSortByAccession
596 {
597 public:
598 bool operator() (const ms_protein * p1, const ms_protein * p2) const {
599 return (*p1 < *p2);
600 }
601 };
602
603 class ms_proteinPtrSortByScore
604 {
605 public:
606 bool operator() (const ms_protein * p1, const ms_protein * p2) const {
607 if (p1->getScore() != p2->getScore()) {
608 return (p1->getScore() > p2->getScore());
609 } else {
610 return (*p1 < *p2);
611 }
612 }
613 };
614
615
616 class ms_pepinfoSortByScore
617 {
618 public:
619 ms_pepinfoSortByScore(std::pair<bool, bool> pairParam): removeDiffPos_(pairParam.first), anyLibraryMatches_(pairParam.second) { }
620 bool operator() (const msparser_internal::PEPINFO * p1, const msparser_internal::PEPINFO * p2) const;
621 ms_pepinfoSortByScore(const ms_pepinfoSortByScore& other): removeDiffPos_(other.removeDiffPos_), anyLibraryMatches_(other.anyLibraryMatches_){}
622 ms_pepinfoSortByScore& operator=(const ms_pepinfoSortByScore& other)
623 {
624 if (&other != this) {
625 removeDiffPos_ = other.removeDiffPos_;
626 anyLibraryMatches_ = other.anyLibraryMatches_;
627 }
628 return *this;
629 }
630
631 private:
632 bool removeDiffPos_;
633 bool anyLibraryMatches_;
634 };
635
636 inline std::ostream& operator << (std::ostream& out, const ms_protein& prot)
637 {
638 out << prot.stringify();
639 return out;
640 }
641
642#endif // end of resfile_group
644} // matrix_science namespace
645
646
647#endif // MS_MASCOTRESPROTEIN_HPP
648
649/*------------------------------- End of File -------------------------------*/
650
651
652
653
Abstract base class of ms_mascotresfile_dat and ms_mascotresfile_msr.
Definition: ms_mascotresfilebase.hpp:72
Abstract class for either ms_peptidesummary or ms_proteinsummary.
Definition: ms_mascotresults.hpp:83
PSM
Type of data to return from accessor methods.
Definition: ms_peptide.hpp:98
This class encapsulates a protein in the mascot results file.
Definition: ms_protein.hpp:57
DISTINCT_PEPTIDE_FLAGS
Enum for getNumDistinctPeptides().
Definition: ms_protein.hpp:187
GROUP
Enum to say if a protein is similar to another higher scoring protein.
Definition: ms_protein.hpp:74
@ GROUP_NO
Does not contain the same set (or a subset) of peptides as another protein. A 'lead' protein.
Definition: ms_protein.hpp:76
@ GROUP_UNKNOWN
No information about grouping.
Definition: ms_protein.hpp:75
@ GROUP_COMPLETE
Contains an identical set of peptides to one or more other proteins.
Definition: ms_protein.hpp:78
@ GROUP_SUBSET
Contains a subset of the peptides in one or more other proteins.
Definition: ms_protein.hpp:77
ms_protein(const double score, const std::string accession, const bool updateScoreFromPepScores, const int proteinSummaryHit=0)
Constructors - used from ms_proteinsummary and ms_peptidesummary.
MASS_FLAGS
Enum for each protein to specify what masses to select.
Definition: ms_protein.hpp:117
int getFrame() const
Returns the frame number for the protein.
Definition: ms_protein.cpp:1454
friend bool operator<(const ms_protein &lhs, const ms_protein &rhs)
Protein objects perform a simple sort of themselves by database ID and then accession.
Definition: ms_protein.hpp:470
DUPLICATE
Enum for each peptide in the protein to indicate whether it is a duplicate.
Definition: ms_protein.hpp:98
@ DUPE_DuplicateSameQuery
Another match for the same query with the same peptide string got a higher score (different mods).
Definition: ms_protein.hpp:101
@ DUPE_Duplicate
Another peptide from a different query with the same sequence as this got a higher score.
Definition: ms_protein.hpp:100
@ DUPE_NotDuplicate
There are no other peptides with the same sequence in this protein - from this query or other queries...
Definition: ms_protein.hpp:99
@ DUPE_HighestScoringDuplicate
There is at least one other peptide the same as this with a lower score.
Definition: ms_protein.hpp:102
Definition: ms_proteinsummary.hpp:45