Matrix Science Mascot Parser toolkit
 
Loading...
Searching...
No Matches
ms_cdb_sharding.hpp
1/*
2##############################################################################
3# file: ms_cdb_sharding.hpp #
4# 'msparser' toolkit #
5# Sharding of the cdb files based on the content and the size #
6# #
7##############################################################################
8# COPYRIGHT NOTICE #
9# Copyright 1998-2021 Matrix Science Limited All Rights Reserved. #
10# #
11##############################################################################
12# $Author: francoisr@matrixscience.com $
13# $Date: 2022-02-18 16:21:35 +0000 $
14# $Revision: b37898d7f040eb50775669c2cfd1f9c88ef4d2da | MSPARSER_REL_3_0_0-2024-09-24-0-g93ebaeb4f4 $
15##############################################################################
16*/
17
18#ifndef MS_CDB_SHARDING_HPP
19#define MS_CDB_SHARDING_HPP
20
21#ifndef SWIG
22
// NOTE(review): using-directive at file scope inside a header - every translation
// unit that includes this file gets all of matrix_science injected into its
// global lookup. The declarations below use ms_errors and ms_taxonomyrules
// unqualified (presumably relying on this directive) while ms_tinycdb is
// already written fully qualified; qualifying the remaining names would allow
// this line to be dropped - confirm against the rest of the toolkit first.
23using namespace matrix_science;
24
25namespace msparser_internal {
26
// Helper class exposing only static file-handling routines (no data members,
// no instance methods are declared here). Every routine returns a bool and
// takes a std::string out-parameter named strError.
// NOTE(review): the per-method documentation is not present in this listing;
// the behavioural notes below are inferred from the signatures only and must
// be confirmed against the implementation file.
27 class CdbPreparingHelper
28 {
29 public:
// Presumably moves sourceFile to targetFile; the bool result presumably signals
// success and strError presumably receives a description on failure - TODO confirm.
38 static bool moveFileSafely(const std::string & sourceFile, const std::string & targetFile, std::string & strError);
39
// Takes an archive directory plus a source and target file name.
// NOTE(review): how targetFile relates to archiveDir (relative name vs. full
// path) cannot be determined from the declaration - confirm.
49 static bool archiveFileToDirectory(const std::string & archiveDir, const std::string & sourceFile, const std::string & targetFile, std::string & strError);
50
// Takes a source and a target directory. taxRules is optional (defaults to
// nullptr); its effect on the operation is not visible here - TODO confirm.
60 static bool moveAllFilesInDirectory(const std::string & sourceDir, const std::string & targetDir, std::string & strError, const ms_taxonomyrules * taxRules = nullptr);
61 };
62
// One shard of the sharded cdb storage described in the file header
// ("Sharding of the cdb files based on the content and the size").
// A shard is identified by a single character (letter_) and holds a vector of
// ms_tinycdb pointers (vecCdbFiles_). Derives publicly from ms_errors.
// NOTE(review): the class declares a destructor and stores raw pointers but
// declares no copy/move operations. Ownership of the pointers is not visible
// from this header - if the destructor deletes them, copying a CdbShard would
// be unsafe; confirm and consider deleting the copy operations.
63 class CdbShard : public ms_errors
64 {
65 public:
66
// Constructs a shard from a base file name, its identifying letter, a cdb file
// count and the two tuning flags.
// NOTE(review): the parameter is spelled "numCbdFiles" whereas the matching
// member is numCdbFiles_ - looks like a typo in the parameter name.
76 CdbShard(const std::string & baseFileName, const char letter, const unsigned numCbdFiles, const bool keepFilesOpen, const bool splitShards);
77
78 ~CdbShard();
79
// Returns the current value of success_.
85 bool isGood() const { return success_; };
86
// The following lifecycle methods are declared only; their behaviour is
// defined out of line. openFiles() returns a std::string - presumably an error
// description, empty on success - TODO confirm against the implementation.
91 void prepareFiles();
92
97 void finaliseFiles();
98
104 std::string openFiles();
105
110 void closeFiles();
111
// Takes a new base file name plus a compression directory and a target
// directory; exact semantics are not visible from the declaration.
116 void retargetFiles(const std::string & targetBaseFileName, const std::string & compressionDir, const std::string & targetDir);
117
// Returns the character identifying this shard (letter_).
123 char getLetter() const { return letter_; };
124
// Returns the number of entries in vecCdbFiles_, converted from size_t to int.
130 int getNumCdb() const { return static_cast<int>(vecCdbFiles_.size()); };
131
// Integer lookup by string key.
// NOTE(review): the value returned for a missing key is not visible here - confirm.
138 int getIntValue(const std::string & key) const;
139
// Stores intValue under key; the bool result presumably signals success - TODO confirm.
147 bool setIntValue(const std::string & key, const int intValue);
148
149 private:
// Builds a cdb file name from a character and a number (definition out of line).
150 std::string getCdbFileName(char c, int num) const;
151
152 std::string baseFileName_;
153 char letter_;       // value reported by getLetter()
154 bool success_;      // value reported by isGood()
155 UINT64 currentCdbSize_;
156 unsigned currentCdbFileIdx_;
157 unsigned numCdbFiles_; //only relevant when reading if splitShards is true
158
159 //performance tuning
160 bool keepFilesOpen_;
161 bool splitShards_;
162
// Raw pointers to the cdb wrappers of this shard; size() is exposed through getNumCdb().
163 std::vector<matrix_science::ms_tinycdb *> vecCdbFiles_;
164 };
165
// Top-level object for the sharded cdb storage: holds a vector of CdbShard
// pointers plus one "main" ms_tinycdb that records how many cdb files each
// shard has (see the comment on mainCdbFile_). Values are read and written by
// accession string. Derives publicly from ms_errors.
// NOTE(review): the class declares a destructor and stores raw pointers
// (mainCdbFile_, vecCharToShard_, vecShards_) but declares no copy/move
// operations. Ownership is not visible from this header - confirm, and
// consider deleting the copy operations if the destructor frees them.
166 class CdbSharding : public ms_errors
167 {
168 public:
169
// preparingDir is optional and defaults to ".".
// NOTE(review): preparingDir is taken by const value, unlike baseFileName
// which is taken by const reference.
178 CdbSharding(const std::string & baseFileName, const bool keepFilesOpen, const bool splitShards, const std::string preparingDir = ".");
179
180 ~CdbSharding();
181
// Defined out of line (unlike CdbShard::isGood, which is inline).
187 bool isGood() const;
188
// Returns the current value of mustCreate_.
// NOTE(review): not const-qualified, so it cannot be called on a const
// CdbSharding even though the body only reads a member.
194 bool mustCreate() { return mustCreate_; }
195
// The lifecycle methods below are declared only. For the bool-returning ones
// the result presumably signals success - TODO confirm against the implementation.
201 bool prepareShards();
202
// Presumably returns working_baseFileName_ - TODO confirm (definition out of line).
208 std::string getWorkingBaseFileName();
209
215 bool finaliseShards();
216
222 bool openShards();
223
228 void closeShards();
229
// Integer lookup by accession string.
// NOTE(review): the value returned for an unknown accession is not visible here - confirm.
236 int getIntValue(const std::string & accession) const;
237
// Stores intValue for the given accession.
245 bool setIntValue(const std::string & accession, const int intValue);
246
247 private:
248 bool deletePreviousFiles();
249 bool isNumberOrLetter(const char c) const;
// Returns a CdbShard pointer for an accession; presumably selected through
// vecCharToShard_ - TODO confirm.
250 CdbShard * getShardForAccession(const std::string & accession) const;
251
252 std::string baseFileName_;
253 bool success_;
254
255 bool mustCreate_;     // value reported by mustCreate()
256
257 //performance tuning
258 bool keepFilesOpen_;
259 bool splitShards_;
260
261 matrix_science::ms_tinycdb * mainCdbFile_; //this is used to store how many cdb files each shard has
// NOTE(review): two vectors of CdbShard pointers; which one (if either) owns
// the shards, and how vecCharToShard_ is indexed, is not visible here - confirm.
262 std::vector<CdbShard *> vecCharToShard_;
263 std::vector<CdbShard *> vecShards_;
264
265 std::string preparingDir_;
266 std::string sourceDir_;
267 std::string baseFileName_withoutPath_;
268 bool differentCompressionDir_;
269 std::string working_baseFileName_;
270 };
271
272} // namespace msparser_internal
273
274#endif // SWIG
275
276#endif // MS_CDB_SHARDING_HPP
This class is used as a base class for several Mascot Parser classes.
Definition: ms_errors.hpp:696
This class represents a single Taxonomy_XXX section in mascot.dat.
Definition: ms_taxonomyrules.hpp:247
Wrapper for the public domain tinycdb package http://www.corpit.ru/mjt/tinycdb.html by Michael Tokarev.
Definition: ms_tinycdb.hpp:124