This class encapsulates a protein in the mascot results file. More...
#include <ms_protein.hpp>
Public Types | |
enum | DISTINCT_PEPTIDE_FLAGS { DPF_SEQUENCE = 0x0001 , DPF_CHARGE = 0x0002 , DPF_MODS = 0x0004 , DPF_UNIQUE = 0x0008 , DPF_NODUPSAMEQUERY = 0x0010 } |
Enum for getNumDistinctPeptides(). More... | |
enum | DUPLICATE { DUPE_NotDuplicate , DUPE_Duplicate , DUPE_DuplicateSameQuery , DUPE_HighestScoringDuplicate , DUPE_Ignored } |
Enum for each peptide in the protein to indicate if it is a duplicate. More... | |
enum | GROUP { GROUP_UNKNOWN , GROUP_NO , GROUP_SUBSET , GROUP_COMPLETE , GROUP_FAMILY } |
Enum to say if a protein is similar to another higher scoring protein. More... | |
enum | MASS_FLAGS { MASS_NON_SELECT_NON_MATCH = 0x0001 , MASS_SELECT_NON_MATCH = 0x0010 , MASS_NON_SELECT_MATCH = 0x0100 , MASS_SELECT_MATCH = 0x1000 } |
enum for each protein to specify what masses to select. More... | |
Public Member Functions | |
ms_protein (const double score, const std::string accession, const bool updateScoreFromPepScores, const int proteinSummaryHit=0) | |
Constructors - used from ms_proteinsummary and ms_peptidesummary. | |
ms_protein (const ms_protein &src) | |
Copying constructor. | |
~ms_protein () | |
Destructor - called automatically - don't call explicitly from Perl or Java. | |
bool | anyBoldRedPeptides (const ms_mascotresults &results) const |
Returns true if any of the peptides in the match were top scoring and not seen before. | |
bool | anyMatchToQuery (const int query) const |
See if any match to this query. | |
bool | anyMatchToQueryAndP (const int query, const int P) const |
See if any match to this query and 'P' (rank / hit). | |
void | copyFrom (const ms_protein *src) |
Copies all content from another instance of the class. | |
std::string | getAccession () const |
Return the accession string for a protein. | |
const ms_protein * | getComponent (const int componentNumber) const |
For UniGene and PMF mixture return the 'component' protein. | |
long | getCoverage () const |
Return the number of residues covered. | |
int | getDB () const |
Return the index of the database where the sequence is found. | |
ms_peptide | getDistinctPeptide (int distinctIndex, int repeatIndex=1, bool aboveThreshold=false, DISTINCT_PEPTIDE_FLAGS flags=DPF_SEQUENCE) const |
Return the peptide repeat of the distinct peptide in the protein's peptide matches. | |
int | getFrame () const |
Returns the frame number for the protein. | |
GROUP | getGrouping () const |
Returns a flag which shows if this protein only contains the same peptides as those in another protein. | |
int | getHitNumber () const |
Returns the hit number in the results list. | |
void | getIgnoredQPs (std::vector< int > &q, std::vector< int > &p) const |
Return a list of queries and ranks that would have been part of this protein hit had they not been removed by IgnoreIonsScoreBelow. | |
int | getLongestPeptideLen () const |
Return the length (in residues) of the longest peptide in the protein. | |
int | getLongestSigPeptideLen () const |
Return the length (in residues) of the longest significant peptide in the protein. | |
std::string | getMasses (ms_mascotresfilebase &resfile, const ms_proteinsummary &summary, const unsigned int flags=MASS_SELECT_MATCH, const int numDecimalPlaces=2) const |
Return a list of comma separated experimental masses according to a specified filter. | |
int | getMemberNumber () const |
Returns the member number within a family in the results list. | |
double | getNonMudpitScore () const |
Will only return a different score from getScore() if the MSRES_MUDPIT_PROTEIN_SCORE flag has been specified. | |
int | getNumComponents () const |
For UniGene and PMF mixture, return number of 'component' proteins. | |
int | getNumDisplayPeptides (bool aboveThreshold=false) const |
Return the number of peptides, excluding those with duplicate matches to the same query. | |
int | getNumDistinctPeptideRepeats (int distinctIndex, bool aboveThreshold=false, DISTINCT_PEPTIDE_FLAGS flags=DPF_SEQUENCE) const |
Return the number of repeats of the distinct peptide in the protein's peptide matches. | |
int | getNumDistinctPeptides (bool aboveThreshold=false, DISTINCT_PEPTIDE_FLAGS flags=DPF_SEQUENCE) const |
Return the number of distinct peptides in the protein sequence. | |
int | getNumObservedForEmPAI () const |
Return the number of peptides 'observed' for emPAI quantitation calculation. | |
int | getNumPeptides () const |
Return the number of peptides that had a match in this protein. | |
int | getPepNumber (const int q, const int p) const |
Return the pepNumber given query and rank. | |
int | getPeptideComponentID (const int pepNumber) const |
Returns 0 except for a UniGene entry or a PMF mixture entry. | |
DUPLICATE | getPeptideDuplicate (const int pepNumber, const bool allowErrTolDuplicate=true) const |
Return the DUPLICATE status given the peptide 'number'. | |
long | getPeptideEnd (const int pepNumber, const ms_peptide::PSM psmComponent=ms_peptide::PSM_COMPLETE) const |
Return the peptide end residue given the peptide 'number'. | |
int | getPeptideFrame (const int pepNumber, const ms_peptide::PSM psmComponent=ms_peptide::PSM_COMPLETE) const |
Return the frame number given the peptide 'number'. | |
double | getPeptideIonsScore (const int pepNumber) const |
Return the ions score within this protein context given the peptide 'number'. | |
bool | getPeptideIsBold (const int pepNumber) const |
Returns true if this peptide should be displayed in bold in a Mascot report. | |
long | getPeptideMultiplicity (const int pepNumber, const ms_peptide::PSM psmComponent=ms_peptide::PSM_COMPLETE) const |
Return the number of precursor matches in this protein for the specified peptide 'number'. | |
int | getPeptideP (const int pepNumber) const |
Return the 'rank' number given the peptide 'number'. | |
int | getPeptideQuery (const int pepNumber) const |
Return the query number given the peptide 'number'. | |
char | getPeptideResidueAfter (const int pepNumber, const ms_peptide::PSM psmComponent=ms_peptide::PSM_COMPLETE) const |
Returns the residue immediately after the peptide. | |
char | getPeptideResidueBefore (const int pepNumber, const ms_peptide::PSM psmComponent=ms_peptide::PSM_COMPLETE) const |
Returns the residue immediately before the peptide. | |
bool | getPeptideShowCheckbox (const int pepNumber) const |
Returns true if a check box for repeat searches should be shown in a Mascot report. | |
long | getPeptideStart (const int pepNumber, const ms_peptide::PSM psmComponent=ms_peptide::PSM_COMPLETE) const |
Return the peptide start residue given the peptide 'number'. | |
int64_t | getProteinId () const |
Return the protein id. | |
int | getProteinSummaryHit () const |
For a protein from the protein summary only. | |
double | getRMSDeltas (const ms_mascotresults &results) const |
Return the RMS value of the deltas between the calculated and experimental value. | |
double | getScore () const |
Return the protein score for this protein. | |
double | getScoreWithET () const |
Return the protein score including ET matches for this protein. | |
int | getSimilarProteinDB () const |
Return the database index of a protein that contains the same set (or a superset) of the peptides in this protein. | |
std::string | getSimilarProteinName () const |
Return the accession of a protein that contains the same set (or a superset) of the peptides in this protein. | |
int | getSimilarProteins (std::vector< std::string > &accessions, std::vector< int > &dbIdxs) const |
Return a list of proteins that contain the same set (or a superset) of the peptides in this protein. | |
std::string | getUnmatchedMasses (ms_mascotresfilebase &resfile, const int numDecimalPlaces=2) const |
Return a list of comma separated experimental masses that don't match. | |
bool | isASimilarProtein (const ms_protein *prot, const ms_mascotresults *results, const bool groupByQueryNumber=false) |
Find a protein in the results. | |
bool | isPMFMixture () const |
Returns true if the 'protein' is actually a PMF mixture. | |
bool | isSimilarProtein (const std::string &acc, const int dbIdx) const |
Returns true if the specified protein has the same set or a superset of the peptides that this protein has. | |
bool | isUnigene () const |
Returns true if the 'protein' is actually a UniGene entry. | |
bool | isUpdateScoreFromPepScores () const |
Returns true if the protein score is updated from the peptide scores. | |
ms_protein & | operator= (const ms_protein &right) |
C++ assignment operator. | |
void | setDB (int dbIdx) |
Set database index. | |
void | setPeptideIsBold (const int pepNumber) |
void | setPeptideShowCheckbox (const int pepNumber) |
void | setProteinId (int64_t proteinId) |
Set Protein id. | |
void | sortPeptides (const ms_mascotresults &results, bool keepAlive=false, int keepAlivePercent=0, const char *keepAliveAccession="", int keepAliveCount=0) |
Sorts the peptides into ascending query number. | |
Friends | |
bool | operator< (const ms_protein &lhs, const ms_protein &rhs) |
Protein objects perform a simple sort of themselves by database ID and then accession. | |
This class encapsulates a protein in the mascot results file.
Pointers to ms_protein objects are returned from ms_peptidesummary::getHit() or ms_proteinsummary::getHit(), so there should be no need to create one of these from outside the library.
Enum for getNumDistinctPeptides().
See Using enumerated values and static const ints in Perl, Java, Python and C#.
There are several possible definitions for 'distinct'!
One or more of these flags can be combined using a bitwise 'OR' operator to determine which peptide matches are treated as distinct matches when counting up matches. Imagine a protein that has the following matches
In this case:
For completeness, getNumDisplayPeptides() will return a count of 6 and getNumPeptides() could return a count of 6 or could return 7 if HSM*TMR and HSMTM*R (where the asterisk indicates the oxidised methionine) both appear in the top 10 matches to the final query. (Some of these functions apply a threshold to the match scores, so this example assumes either no threshold is used or all matches are above threshold.)
A further complication is uniqueness within the whole database search. A peptide sequence that is distinct in one protein hit may also appear in another protein hit in the search. If you specify the flag DPF_UNIQUE, then only peptide matches that are unique within the whole search are counted, subject to the other flags. For example, if DPF_SEQUENCE .OR. DPF_UNIQUE is specified in the example above, then getNumDistinctPeptides() may return 0, 1 or 2, depending on which other protein hits contain the distinct peptide sequences assigned to the current protein hit.
The MCP guidelines require a count of "the total number of peptides assigned to the protein. To compute this number, multiple matches to peptides with the same primary sequence count as one, even if they represent different charge states or modification states". Specify DPF_SEQUENCE by itself to obtain this value.
A flags value that does not include DPF_SEQUENCE is unlikely to give a useful return value from getNumDistinctPeptides().
Enumerator | |
---|---|
DPF_SEQUENCE | Peptide matches must have different primary sequences to be counted as distinct matches. |
DPF_CHARGE | Peptide matches must have different charge states to be counted as distinct matches. |
DPF_MODS | Peptide matches must have different modification states to be counted as distinct matches. |
DPF_UNIQUE | Peptide matches must be unique in the whole search to be counted as distinct matches. |
DPF_NODUPSAMEQUERY | Duplicate peptide matches from the same query should be excluded (see DUPE_DuplicateSameQuery ). |
enum DUPLICATE |
Enum for each peptide in the protein to indicate if it is a duplicate.
See Using enumerated values and static const ints in Perl, Java, Python and C#.
A protein match is made up of one or more peptides. Duplicate peptides don't increase the coverage of the protein. They also do not increase the score except for MudPIT scoring.
enum GROUP |
Enum to say if a protein is similar to another higher scoring protein.
See Grouping proteins together and Using enumerated values and static const ints in Perl, Java, Python and C#.
Note that if there are say 3 proteins with the same 4 peptide matches, then the highest scoring protein will have GROUP_NO, and the other two will have GROUP_COMPLETE. Calling getSimilarProteinName() on the highest scoring protein will return an empty string. Calling it for the other two proteins will return the accession for the highest scoring protein.
enum MASS_FLAGS |
enum for each protein to specify what masses to select.
See Using enumerated values and static const ints in Perl, Java, Python and C#.
Only a subset of all masses is used for scoring proteins. However, all matching masses are usually reported for each protein. Using these flags one can specify more precisely what sub-set of masses one is interested in. The flags can be combined with binary OR ("|"-operator in C++).
ms_protein | ( | const ms_protein & | src | ) |
Copying constructor.
Calling this function ensures that all the data is loaded into memory in the case where ms_peptidesummary::MSPEPSUM_DISCARD_RELOADABLE is specified.
src | is the ms_protein object that will be copied. |
bool anyMatchToQuery | ( | const int | query | ) | const |
See if any match to this query.
This could be useful if you need to give a list of unmatched queries for a given protein.
query | Query number |
bool anyMatchToQueryAndP | ( | const int | query, |
const int | P | ||
) | const |
See if any match to this query and 'P' (rank / hit).
This is useful for finding if this protein matched an identical peptide to another protein.
query | query number |
P | rank number |
void copyFrom | ( | const ms_protein * | src | ) |
Copies all content from another instance of the class.
Calling this function ensures that all the data is loaded into memory in the case where ms_peptidesummary::MSPEPSUM_DISCARD_RELOADABLE is specified.
src | is a pointer to the source object |
std::string getAccession | ( | ) | const |
Return the accession string for a protein.
This will always be available, for every protein.
const ms_protein * getComponent | ( | const int | componentNumber | ) | const |
For UniGene and PMF mixture return the 'component' protein.
For Unigene and PMF mixture, each 'protein' is made up of a number of components. Call this method to get the protein 'components' that were used to make up this 'pseudo' protein.
See Peptide mass fingerprint mixtures and Maintaining object references: two rules of thumb.
componentNumber | must be in the range 1..getNumComponents(), or a null value will be returned. No error message is generated for an out of range call. |
long getCoverage | ( | ) | const |
Return the number of residues covered.
If two peptides overlap, then the overlapped ones are only counted once. Getting the coverage as a percentage is not possible from the results file because the length of the protein is not stored in the file. An approximate value could be calculated using ms_mascotresults::getProteinMass() and dividing by 110.
This function currently returns '0' for a PMF mixture and for a UniGene entry.
int getDB | ( | ) | const |
Return the index of the database where the sequence is found.
Use ms_searchparams::getDB() to retrieve the database name using the index returned by this function.
ms_peptide getDistinctPeptide | ( | int | distinctIndex, |
int | repeatIndex = 1 , |
||
bool | aboveThreshold = false , |
||
DISTINCT_PEPTIDE_FLAGS | flags = DPF_SEQUENCE |
||
) | const |
Return the peptide repeat of the distinct peptide in the protein's peptide matches.
getNumDistinctPeptides() and getNumDistinctPeptideRepeats() indicate how many distinct peptides the protein has and how often they recur.
The individual occurrences of the peptide are sorted by their ions scores in decreasing order. The first one (repeatIndex = 1) is the occurrence with the highest score.
When determining the valid ranges for the indexes, the same values should be passed for the aboveThreshold and flags parameters.
This can be used to retrieve the peptides to construct a tree as seen on the Mascot Search Results report or the Mascot Distiller protein tab.
To create a tree that matches the Mascot Search Results report, set aboveThreshold to false and use flags DPF_SEQUENCE, DPF_CHARGE, DPF_MODS and DPF_NODUPSAMEQUERY.
To create a tree that matches the Mascot Distiller protein tab, set aboveThreshold to false and use flags DPF_SEQUENCE, DPF_MODS and DPF_NODUPSAMEQUERY.
The distinct peptides are ordered into increasing molecular weight (see ms_peptide::getMrCalc()). The lowest Mr(calc) is the first peptide (distinctIndex = 1).
The repeats are ordered into decreasing score (see ms_peptide::getIonsScore()). The highest score is always the first repeat (repeatIndex = 1).
distinctIndex | Index into a list of distinct peptides, 1..getNumDistinctPeptides(). |
repeatIndex | Index into a list of repeats of the distinct peptide, 1..getNumDistinctPeptideRepeats(). Set this to 1 to get the primary (highest score) instance of the distinct peptide. |
aboveThreshold | If true, the function will only count the number of peptides above the threshold. The threshold used will be ms_mascotresults::getPeptideIdentityThreshold() unless ms_peptidesummary::MSPEPSUM_USE_HOMOLOGY_THRESH was specified in the ms_peptidesummary constructor in which case the threshold is returned by ms_peptidesummary::getHomologyThreshold(). |
flags | - see ms_protein::DISTINCT_PEPTIDE_FLAGS for details. |
int getFrame | ( | ) | const |
Returns the frame number for the protein.
A value of -1 will be returned if the peptides come from different frames. For a protein database, a value of zero will be returned. Frames 1 to 3 are the 'forward' strand, and 4 to 6 are the 'reverse' strand.
ms_protein::GROUP getGrouping | ( | ) | const |
Returns a flag which shows if this protein only contains the same peptides as those in another protein.
See Grouping proteins together.
int getHitNumber | ( | ) | const |
Returns the hit number in the results list.
For a protein that is a subset, the value returned is the hit number of the 'main' protein.
void getIgnoredQPs | ( | std::vector< int > & | q, |
std::vector< int > & | p | ||
) | const |
Return a list of queries and ranks that would have been part of this protein hit had they not been removed by IgnoreIonsScoreBelow.
Note that this method is only useful when the search is an integrated spectral library search, the results file has been opened in integrated library mode (ms_peptidesummary::MSPEPSUM_SL_INTEGRATED) and IgnoreIonsScoreBelow is set to a non-zero value.
When all the requirements are met, this method returns the queries and ranks (q,p) of the peptide matches removed due to IgnoreIonsScoreBelow but which would have been part of this protein hit otherwise. The q,p values are needed when iterating over all peptide matches assigned to this protein hit, because evidence for a peptide sequence can come from either search engine.
For example, suppose query 4339 has two matches to the same sequence: significant rank 1 match from the FASTA file and non-significant rank 2 match from the spectral library. Let the rank 1 match be in FASTA accession FAS1 and the rank 2 match in library accession LIB1. Since the matches have the same sequence, the rank 1 match provides evidence for it (it is significant) and the sequence is used in protein grouping. FAS1 and LIB1 end up in the same family, since they match the same significant sequence.
Suppose the rank 2 match is now hidden due to IgnoreIonsScoreBelow. Suppose further that FAS1 ends up as a subset of LIB1. If you iterate over the visible peptide matches of LIB1, query 4339 is nowhere to be seen, because it's hidden. But FAS1 might not appear anywhere either, since it's a subset protein. Query 4339 won't be in the unassigned list, because its rank 1 match was used as peptide evidence in protein grouping.
The only way to discover query 4339 is by iterating over the ignored matches in LIB1. There, query 4339 has the rank 2 match, and you can inspect the other ranks in the query to discover the rank 1 match.
[out] | q | Vector in which the list of queries is returned. |
[out] | p | Vector in which the list of ranks is returned. |
int getLongestPeptideLen | ( | ) | const |
Return the length (in residues) of the longest peptide in the protein.
int getLongestSigPeptideLen | ( | ) | const |
Return the length (in residues) of the longest significant peptide in the protein.
The threshold can be the identity or the homology threshold and is determined using ms_mascotresults::getPeptideThreshold()
std::string getMasses | ( | ms_mascotresfilebase & | resfile, |
const ms_proteinsummary & | summary, | ||
const unsigned int | flags = MASS_SELECT_MATCH , |
||
const int | numDecimalPlaces = 2 |
||
) | const |
Return a list of comma separated experimental masses according to a specified filter.
This is useful for selecting and displaying the list of observed mass values that satisfy the selection criteria. A number of different flag combinations can produce various sets of masses. An incomplete list of possible combinations follows:
See Maintaining object references: two rules of thumb.
resfile | file object to extract information from. |
summary | summary-object to extract information from. |
flags | controls what masses should be returned (see ms_protein::MASS_FLAGS for the complete list of possible values). |
numDecimalPlaces | number of decimal places for a formatted mass. |
int getMemberNumber | ( | ) | const |
Returns the member number within a family in the results list.
For a protein that is not in a family, the value returned is 0. For a 'main' protein in a family, the value returned is 1. For other members of a family, the value returned is 2 or more.
double getNonMudpitScore | ( | ) | const |
Will only return a different score from getScore() if the MSRES_MUDPIT_PROTEIN_SCORE flag has been specified.
For a protein summary, this will be the same as returned by getScore().
For a peptide summary, if the ms_mascotresults::MSRES_MUDPIT_PROTEIN_SCORE has been specified as a flag when creating the ms_peptidesummary object, then the protein score will be calculated differently to offset some artifacts created when the number of spectra approaches the number of sequences in the database (e.g. for MudPIT data sets). See getScore() for details.
If the MUDPIT flag was specified, then the old score can be obtained using this function.
For a peptide summary where ms_mascotresults::MSRES_MUDPIT_PROTEIN_SCORE was not specified, this function will return the same value as getScore().
int getNumComponents | ( | ) | const |
For UniGene and PMF mixture, return number of 'component' proteins.
For UniGene and PMF mixture, each 'protein' is made up of a number of components. Call this method to see how many 'components' were used to make up this 'pseudo' protein. Then call getComponent() to get each of the proteins in turn.
For a 'real' protein (i.e. not a mixture or UniGene entry), this method will return zero.
See Peptide mass fingerprint mixtures.
int getNumDisplayPeptides | ( | bool | aboveThreshold = false | ) | const |
Return the number of peptides, excluding those with duplicate matches to the same query.
There can be multiple matches to a peptide from the same query; this will occur when there are matches with different mods or mods in different locations. In this case, it is normal to display them using the following loop:
for (int i=1; i <= prot->getNumPeptides(); i++) {
    int query = prot->getPeptideQuery(i);
    int p = prot->getPeptideP(i);
    if (p != -1 && query != -1 && prot->getPeptideDuplicate(i) != ms_protein::DUPE_DuplicateSameQuery) {
        // Display peptide match
    }
}
For an error tolerant search, if the top match to a query is an error tolerant match, then the query does not contribute to the number of matches above the threshold even if the rank 2 match for the query is above the threshold.
aboveThreshold | If true, the function will only count the number of peptides above the threshold. The threshold used will be ms_mascotresults::getPeptideIdentityThreshold() unless ms_peptidesummary::MSPEPSUM_USE_HOMOLOGY_THRESH was specified in the ms_peptidesummary constructor in which case the threshold is returned by ms_peptidesummary::getHomologyThreshold(). |
int getNumDistinctPeptideRepeats | ( | int | distinctIndex, |
bool | aboveThreshold = false , |
||
DISTINCT_PEPTIDE_FLAGS | flags = DPF_SEQUENCE |
||
) | const |
Return the number of repeats of the distinct peptide in the protein's peptide matches.
getNumDistinctPeptides() returns the number of distinct peptides (distinct as defined by the calling parameters). Each distinct peptide may occur more than once in the complete set of peptide matches for the protein.
This returns the number of times the distinct peptide occurs. The individual occurrences can then be accessed with getDistinctPeptide().
When determining the valid ranges for the index, the same values should be passed for the aboveThreshold and flags parameters.
distinctIndex | Index into a list of distinct peptides, 1..getNumDistinctPeptides() |
aboveThreshold | If true, the function will only count the number of peptides above the threshold. The threshold used will be ms_mascotresults::getPeptideIdentityThreshold() unless ms_peptidesummary::MSPEPSUM_USE_HOMOLOGY_THRESH was specified in the ms_peptidesummary constructor in which case the threshold is returned by ms_peptidesummary::getHomologyThreshold(). |
flags | - see ms_protein::DISTINCT_PEPTIDE_FLAGS for details. |
int getNumDistinctPeptides | ( | bool | aboveThreshold = false , |
DISTINCT_PEPTIDE_FLAGS | flags = DPF_SEQUENCE |
||
) | const |
Return the number of distinct peptides in the protein sequence.
Useful, for example, for MCP reports.
aboveThreshold | If true, the function will only count the number of peptides above the threshold. The threshold used will be ms_mascotresults::getPeptideIdentityThreshold() unless ms_peptidesummary::MSPEPSUM_USE_HOMOLOGY_THRESH was specified in the ms_peptidesummary constructor in which case the threshold is returned by ms_peptidesummary::getHomologyThreshold(). |
flags | - see ms_protein::DISTINCT_PEPTIDE_FLAGS for details. |
int getNumObservedForEmPAI | ( | ) | const |
Return the number of peptides 'observed' for emPAI quantitation calculation.
This function should not normally be called directly but is called by ms_mascotresults::getProteinEmPAI
The count of observed peptides only includes peptide matches with scores at or above the homology threshold, or the identity threshold, if there is no homology threshold. Ishihama et al. obtained best proportionality for a standard protein mixture by counting unique parent ions, including different charge states from the same peptide sequence. This function counts the number of unique parent ions ignoring charge state, which produces better results when the number of charge states is large (e.g. 2+, 3+, 4+, 5+, 6+ and 7+). The differences are negligible when the data are only singly or doubly charged.
This function will still return a value even if ms_mascotresults::isEmPAIallowed() returns false.
If the protein contains any intact crosslinks, the emPAI assumptions are no longer valid, so the count of peptides returned is 0.
The value returned is stored in the cache file when Using the pepsum cache (MSR and dat28)
int getNumPeptides | ( | ) | const |
Return the number of peptides that had a match in this protein.
This includes peptides that are duplicates. See also getNumDisplayPeptides().
int getPepNumber | ( | const int | q, |
const int | p | ||
) | const |
Return the pepNumber given query and rank.
A matched protein contains a number of peptides. Further information about the peptide in the context of the protein can be obtained by calling getPeptideFrame(), getPeptideStart(), getPeptideEnd() etc. These functions all require a pepNumber, and this may be found using this function in cases where only q and p are readily available.
q | is the query number in the range 1 to ms_mascotresfilebase::getNumQueries(). |
p | is the 'rank' number. For a peptide summary, the top 10 matches are saved and hence p would normally be in the range 1 to 10. See ms_peptidesummary::getMaxRankValue() and ms_proteinsummary::getMaxRankValue(). |
int getPeptideComponentID | ( | const int | pepNumber | ) | const |
Returns 0 except for a UniGene entry or a PMF mixture entry.
A matched protein contains a number of peptides. Further information about the peptide can be obtained by getting an ms_peptide object using getPeptideQuery(), getPeptideP() and ms_mascotresults::getPeptide().
If this protein is really just a UniGene entry, or a PMF mixture entry then it is not a 'real' protein but just a container for a number of component proteins. In this case, each peptide originates from one of the components. The 'real' protein that corresponds to each component can be found using getComponent().
In the case of the same peptide being found in multiple components, the ID returned will be any one of these components.
pepNumber | must be in the range 1 to getNumPeptides(). |
ms_protein::DUPLICATE getPeptideDuplicate | ( | const int | pepNumber, |
const bool | allowErrTolDuplicate = true |
||
) | const |
Return the DUPLICATE status given the peptide 'number'.
A matched protein contains a number of peptides. Further information about the peptide can be obtained by getting an ms_peptide object using getPeptideQuery(), getPeptideP() and ms_mascotresults::getPeptide().
However, duplicate peptides could possibly be different for different proteins, so this value is not available in the ms_peptide object, but can be found in here in the protein object.
The allowErrTolDuplicate parameter controls duplicate checking behaviour in error tolerant searches. When true (default), the peptide pepNum is compared to all first pass and second pass (ET) matches. If the peptide is flagged as a duplicate, it may be a duplicate of a first or a second pass match.
When allowErrTolDuplicate is false, the peptide pepNum is only compared to first pass matches. This special case is used during protein inference.
pepNumber | must be in the range 1 to getNumPeptides(). |
allowErrTolDuplicate | is true by default. See above for behaviour when set to false. |
long getPeptideEnd | ( | const int | pepNumber, |
const ms_peptide::PSM | psmComponent = ms_peptide::PSM_COMPLETE |
||
) | const |
Return the peptide end residue given the peptide 'number'.
A matched protein contains a number of peptides. Further information about the peptide can be obtained by getting an ms_peptide object using getPeptideQuery(), getPeptideP() and ms_mascotresults::getPeptide().
However, the same peptide sequence may occur in different places in different proteins, so the start and end residue information is not available in the ms_peptide object, but can be found in here using this function and getPeptideStart(). The returned number is 1 based.
The psmComponent argument is only meaningful in a crosslinked search. If the peptide match has an intact crosslink, psmComponent specifies the type of data to return — alpha peptide end or beta peptide end. If alpha or beta is in a different protein, or if you pass PSM_COMPLETE, the return value is -1.
If the peptide match is not crosslinked, psmComponent is ignored.
pepNumber | must be in the range 1 to getNumPeptides(). |
psmComponent | is the type of data to return: complete molecule, alpha peptide or beta peptide. |
int getPeptideFrame | ( | const int | pepNumber, |
const ms_peptide::PSM | psmComponent = ms_peptide::PSM_COMPLETE |
||
) | const |
Return the frame number given the peptide 'number'.
A matched protein contains a number of peptides. Further information about the peptide can be obtained by getting an ms_peptide object using getPeptideQuery(), getPeptideP() and ms_mascotresults::getPeptide().
However, the same peptide sequence may occur in different frames in different proteins, so the frame information is not available in the ms_peptide object, but can be found in here in the protein object.
The psmComponent argument is only meaningful in a crosslinked search. If the peptide match has an intact crosslink, psmComponent specifies the type of data to return — alpha peptide frame or beta peptide frame. If alpha or beta is in a different protein, or if you pass PSM_COMPLETE, the return value is -1.
If the peptide match is not crosslinked, psmComponent is ignored.
pepNumber | must be in the range 1 to getNumPeptides(). |
psmComponent | is the type of data to return: complete molecule, alpha peptide or beta peptide. |
double getPeptideIonsScore | ( | const int | pepNumber | ) | const |
Return the ions score within this protein context given the peptide 'number'.
A matched protein contains a number of peptides. Further information about the peptide can be obtained by getting an ms_peptide object using getPeptideQuery(), getPeptideP() and ms_mascotresults::getPeptide().
However, there are minor corrections to the score for each peptide depending on the protein that it is found in.
The Mascot results pages display the score returned from ms_peptide::getIonsScore() because results from similar proteins are displayed together. For ms_proteinsummary results, the return values from ms_peptide::getIonsScore() and ms_protein::getPeptideIonsScore() will be identical.
For an integrated error tolerant search where ms_mascotresults::MSRES_INTEGRATED_ERR_TOL is specified, the protein score is derived from the highest scoring non error tolerant match for each query, and this is the value returned by this function.
In an integrated spectral library search, if the peptide at pepNumber is a library match, the return values from ms_peptide::getIonsScore() and ms_protein::getPeptideIonsScore() will be identical. Library scores are not affected by multiplicity correction.
pepNumber | must be in the range 1 to getNumPeptides(). |
bool getPeptideIsBold | ( | const int | pepNumber | ) | const |
Returns true if this peptide should be displayed in bold in a Mascot report.
A matched protein contains a number of peptides. Further information about the peptide can be obtained by getting an ms_peptide object using getPeptideQuery(), getPeptideP() and ms_mascotresults::getPeptide().
This function returns true if this peptide should be displayed in bold in a Mascot report. Bold is used for the first time a query is shown in a report. See also ms_peptide::getFirstProtAppearedIn().
pepNumber | must be in the range 1 to getNumPeptides(). |
long getPeptideMultiplicity | ( | const int | pepNumber, |
const ms_peptide::PSM | psmComponent = ms_peptide::PSM_COMPLETE |
||
) | const |
Return the number of precursor matches in this protein for the specified peptide 'number'.
A matched protein contains a number of peptides. Further information about the peptide can be obtained by getting an ms_peptide object using getPeptideQuery(), getPeptideP() and ms_mascotresults::getPeptide().
The multiplicity value is the number of times that the precursor mass for the specified peptide got a match in this protein. With a tight tolerance and no variable modifications, this will normally be a small number. For a large protein, with no enzyme specificity and a large number of modifications (or an error tolerant search), this can be a large number.
The psmComponent argument is only meaningful in a crosslinked search. If the peptide match has an intact crosslink, psmComponent specifies the type of data to return — alpha peptide multiplicity or beta peptide multiplicity. If alpha or beta is in a different protein, or if you pass PSM_COMPLETE, the return value is -1.
If the peptide match is not crosslinked, psmComponent is ignored.
This value is used internally for standard protein scoring and is not normally required outside Mascot Parser.
pepNumber | must be in the range 1 to getNumPeptides(). |
psmComponent | is the type of data to return: complete molecule, alpha peptide or beta peptide. |
int getPeptideP | ( | const int | pepNumber | ) | const |
Return the 'rank' number given the peptide 'number'.
A matched protein contains a number of peptides. These peptides all originate from a 'query'. For peptide summary information, the top 10 scoring results are kept. This is the 'rank' number.
For protein summary information, 'P' refers to the protein hit number.
To get an ms_peptide object, call ms_mascotresults::getPeptide() using the return value from this function and the 'query' from ms_protein::getPeptideQuery().
pepNumber | must be in the range 1 to getNumPeptides(). |
int getPeptideQuery | ( | const int | pepNumber | ) | const |
Return the query number given the peptide 'number'.
A matched protein contains a number of peptides. These peptides all originate from a 'query'. The query number is returned by this function.
To get an ms_peptide object, call ms_mascotresults::getPeptide() using the return value from this function and the 'p' value from ms_protein::getPeptideP().
pepNumber | must be in the range 1 to getNumPeptides(). |
char getPeptideResidueAfter | ( | const int | pepNumber, |
const ms_peptide::PSM | psmComponent = ms_peptide::PSM_COMPLETE |
||
) | const |
Returns the residue immediately after the peptide.
A matched protein contains a number of peptides. Further information about the peptide can be obtained by getting an ms_peptide object using getPeptideQuery(), getPeptideP() and ms_mascotresults::getPeptide().
The residues before and after are only saved in the results files for Mascot 2.1 and later. For files created with earlier versions of Mascot, a '?' will be returned.
If the peptide is a C terminal peptide, then this function will return '-'.
If the search was against nucleic acid data and the peptide is just before a stop codon, then this function will return '@'.
The psmComponent argument is only meaningful in a crosslinked search. If the peptide match has an intact crosslink, psmComponent specifies the type of data to return — alpha peptide residue or beta peptide residue. If alpha or beta is in a different protein, or if you pass PSM_COMPLETE, the return value is '?'.
If the peptide match is not crosslinked, psmComponent is ignored.
pepNumber | must be in the range 1 to getNumPeptides(). |
psmComponent | is the type of data to return: complete molecule, alpha peptide or beta peptide. |
char getPeptideResidueBefore | ( | const int | pepNumber, |
const ms_peptide::PSM | psmComponent = ms_peptide::PSM_COMPLETE |
||
) | const |
Returns the residue immediately before the peptide.
A matched protein contains a number of peptides. Further information about the peptide can be obtained by getting an ms_peptide object using getPeptideQuery(), getPeptideP() and ms_mascotresults::getPeptide().
The residues before and after are only saved in the results files for Mascot 2.1 and later. For files created with earlier versions of Mascot, a '?' will be returned.
If the peptide is an N terminal peptide, then this function will return '-'.
If the search was against nucleic acid data and the peptide is just after a stop codon, then this function will return '@'.
The psmComponent argument is only meaningful in a crosslinked search. If the peptide match has an intact crosslink, psmComponent specifies the type of data to return — alpha peptide residue or beta peptide residue. If alpha or beta is in a different protein, or if you pass PSM_COMPLETE, the return value is '?'.
If the peptide match is not crosslinked, psmComponent is ignored.
pepNumber | must be in the range 1 to getNumPeptides(). |
psmComponent | is the type of data to return: complete molecule, alpha peptide or beta peptide. |
bool getPeptideShowCheckbox | ( | const int | pepNumber | ) | const |
Returns true if a check box for repeat searches should be shown in a Mascot report.
A matched protein contains a number of peptides. Further information about the peptide can be obtained by getting an ms_peptide object using getPeptideQuery(), getPeptideP() and ms_mascotresults::getPeptide().
A check box is displayed if this is the first rank 1 match that has been displayed for this query. See also ms_peptide::getRank() and ms_peptide::getPrettyRank().
By definition, all unassigned queries will need a check box.
pepNumber | must be in the range 1 to getNumPeptides(). |
long getPeptideStart | ( | const int | pepNumber, |
const ms_peptide::PSM | psmComponent = ms_peptide::PSM_COMPLETE |
||
) | const |
Return the peptide start residue given the peptide 'number'.
A matched protein contains a number of peptides. Further information about the peptide can be obtained by getting an ms_peptide object using getPeptideQuery(), getPeptideP() and ms_mascotresults::getPeptide().
However, the same peptide sequence may occur in different places in different proteins, so the start and end residue information is not available in the ms_peptide object, but can be found in here using this function and getPeptideEnd(). The returned number is 1 based.
The psmComponent argument is only meaningful in a crosslinked search. If the peptide match has an intact crosslink, psmComponent specifies the type of data to return — alpha peptide start or beta peptide start. If alpha or beta is in a different protein, or if you pass PSM_COMPLETE, the return value is -1.
If the peptide match is not crosslinked, psmComponent is ignored.
A value of -1 is returned if there is an error.
pepNumber | must be in the range 1 to getNumPeptides(). |
psmComponent | is the type of data to return: complete molecule, alpha peptide or beta peptide. |
int64_t getProteinId | ( | ) | const |
Return the protein id.
Unique identifier for proteins used in msr format.
int getProteinSummaryHit | ( | ) | const |
For a protein from the protein summary only.
There should be no real reason to use this method outside the library, apart from when determining a value to pass as the singleHit parameter when calling ms_proteinsummary::ms_proteinsummary().
If the protein came from the summary section (or mixture section) then this will return the hit number. For a protein that came from a ms_peptidesummary, this will return zero.
Used within the library to keep the sort order for a PMF the same as in the results file (this only matters for proteins with similar scores).
double getRMSDeltas | ( | const ms_mascotresults & | results | ) | const |
Return the RMS value of the deltas between the calculated and experimental value.
The value is returned in ppm.
results | reference to an ms_mascotresults object |
double getScore | ( | ) | const |
Return the protein score for this protein.
Two protein scoring algorithms are available: MudPIT scoring (recommended) and standard scoring. For a protein summary, only standard scoring is supported.
If the flag ms_mascotresults::MSRES_MUDPIT_PROTEIN_SCORE is specified, protein score is calculated by:
Protein score = 0
For each peptide match {
  If there is a homology threshold and ions score > homology threshold {
    Protein score += ions score - homology threshold
  } else if ions score > identity threshold {
    Protein score += ions score - identity threshold
  }
}
Protein score += 1 * average of all the subtracted thresholds
In spectral library searches (Parser 2.6 and later), the algorithm is the same but the score excess over threshold has a different form depending on library mode:
In versions prior to 2.2, the thresholds were not affected by the minProbability parameter of the ms_peptidesummary constructor; a default value of 1 in 20 was always used.
The standard protein score is the sum of ions scores for each match. For duplicate peptides, just the highest score is taken.
The standard score can be lower than the sum of the ions scores, particularly when the protein is large. This is because a correction is applied to compensate for the accumulation of random ions scores from random matches. The difference is more substantial when doing a no-enzyme search, because there are orders of magnitude more random matches. See the function ms_mascotresults::getIonsScoreCorrected() for details of how the correction is calculated. If the correction causes the ions score to become negative, then this ions score is ignored when calculating the protein score.
Note that the match score correction is only used with standard scoring. MudPIT scoring always uses the uncorrected score.
double getScoreWithET | ( | ) | const |
Return the protein score including ET matches for this protein.
Returns the score of the protein including ET matches. This is used mainly for Family grouping, to differentiate between two otherwise identical proteins.
int getSimilarProteinDB | ( | ) | const |
Return the database index of a protein that contains the same set (or a superset) of the peptides in this protein.
This function returns a single protein database ID. When using the ms_mascotresults::MSRES_CLUSTER_PROTEINS flag, a subset protein may be a subset of more than one parent protein. To find the complete list of proteins that it is a subset of, call getSimilarProteins(). This function just returns the 'first' protein in the list of superset proteins. There will only be multiple 'similar' proteins in cases where ms_mascotresults::MSRES_CLUSTER_PROTEINS is specified and where this protein is a GROUP_SUBSET. There will only be a single 'similar' protein where this protein is GROUP_COMPLETE.
See Grouping proteins together, getGrouping() and getSimilarProteinName().
std::string getSimilarProteinName | ( | ) | const |
Return the accession of a protein that contains the same set (or a superset) of the peptides in this protein.
This function returns a single protein accession string. When using the ms_mascotresults::MSRES_CLUSTER_PROTEINS flag, a subset protein may be a subset of more than one parent protein. To find the complete list of proteins that it is a subset of, call getSimilarProteins(). This function just returns the 'first' protein in the list of superset proteins. There will only be multiple 'similar' proteins in cases where ms_mascotresults::MSRES_CLUSTER_PROTEINS is specified and where this protein is a GROUP_SUBSET. There will only be a single 'similar' protein where this protein is GROUP_COMPLETE.
See Grouping proteins together, getGrouping() and getSimilarProteinDB().
int getSimilarProteins | ( | std::vector< std::string > & | accessions, |
std::vector< int > & | dbIdxs | ||
) | const |
Return a list of proteins that contain the same set (or a superset) of the peptides in this protein.
See Using MSRES_CLUSTER_PROTEINS.
accessions | Is the list of accessions for which this protein is a sameset or a subset. See Using STL vector classes vectori, vectord and VectorString in Perl, Java, Python and C#. |
dbIdxs | Is the corresponding list of databases for the accessions. This array will be the same size as the accessions array. For a search against a single database, all the IDs will be 1. See Using STL vector classes vectori, vectord and VectorString in Perl, Java, Python and C#. |
std::string getUnmatchedMasses | ( | ms_mascotresfilebase & | resfile, |
const int | numDecimalPlaces = 2 |
||
) | const |
Return a list of comma separated experimental masses that don't match.
This is useful for displaying the list of observed mass values that failed to get a match to a protein hit in a PMF (listed at end of each hit in a protein summary report).
resfile | reference to a ms_mascotresfilebase object |
numDecimalPlaces | decimal precision |
bool isASimilarProtein | ( | const ms_protein * | prot, |
const ms_mascotresults * | results, | ||
const bool | groupByQueryNumber = false |
||
) |
Find a protein in the results.
The function looks to see if 'self' contains the same set or a subset of matching peptides as the passed 'prot'. If it does, then it sets its group to be GROUP_COMPLETE or GROUP_SUBSET and also sets the similar protein accession.
prot | Is the protein to compare. |
results | Need to be passed for access to the peptide information. |
groupByQueryNumber | Is used to determine whether peptide similarity is just by query number (for PMF) or by peptide string for MS-MS. |
bool isPMFMixture | ( | ) | const |
Returns true if the 'protein' is actually a PMF mixture.
To find out what 'component' proteins were used to get this entry, see getComponent() and Peptide mass fingerprint mixtures.
bool isUpdateScoreFromPepScores | ( | ) | const |
Returns true if protein score is updated from peptides.
TODO
void setDB | ( | int | idx | ) |
Set database index.
idx | database index from 1 to ms_searchparams::getNumberOfDatabases(). |
void setPeptideIsBold | ( | const int | pepNumber | ) |
A matched protein contains a number of peptides. Further information about the peptide can be obtained by getting an ms_peptide object using getPeptideQuery(), getPeptideP() and ms_mascotresults::getPeptide().
pepNumber | must be in the range 1 to getNumPeptides(). |
void setPeptideShowCheckbox | ( | const int | pepNumber | ) |
A matched protein contains a number of peptides. Further information about the peptide can be obtained by getting an ms_peptide object using getPeptideQuery(), getPeptideP() and ms_mascotresults::getPeptide().
A check box is displayed if this is the first rank 1 match that has been displayed for this query. See also ms_peptide::getRank() and ms_peptide::getPrettyRank().
By definition, all unassigned queries will need a check box.
pepNumber | must be in the range 1 to getNumPeptides(). |
void setProteinId | ( | int64_t | proteinId | ) |
Set Protein id.
Set the unique identifier for proteins used in msr format
proteinId | Unique id retrieved from the msr file. |
|
friend |
Protein objects perform a simple sort of themselves by database ID and then accession.
Final sorting for proteins by score and then accession is more complex.
lhs | left element to compare |
rhs | right element to compare |