forked from gigablast/open-source-search-engine
-
Notifications
You must be signed in to change notification settings - Fork 0
/
Cachedb.cpp
175 lines (161 loc) · 5.11 KB
/
Cachedb.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
#include "Cachedb.h"
#include "Threads.h"
// for seo-related objects:
Cachedb g_cachedb;
// for seo serps:
Cachedb g_serpdb;
// Release all in-memory state held by the underlying Rdb (tree, maps,
// buffers). Delegates entirely to Rdb::reset(); presumably this does not
// touch the on-disk files -- confirm against Rdb::reset() if that matters.
void Cachedb::reset() {
m_rdb.reset();
}
bool Cachedb::init ( ) {
// we use the same disk page size as indexdb (for rdbmap.cpp)
//int32_t pageSize = GB_INDEXDB_PAGE_SIZE;
// set this for debugging
//int64_t maxTreeMem = 1000000;
// i've seen some debug entries like 33MB because of
// m_debugDocIdScoreBuf and m_origDocIdScoreBuf take up so much space!
// so don't cache those any more!!
int64_t maxTreeMem = 40000000; // 40MB g_serpdb, 40MB g_cachedb
// . what's max # of tree nodes?
// . key+4+left+right+parents+dataPtr = sizeof(key96_t)+4 +4+4+4+4
// . 32 bytes per record when in the tree
// . >1000 bytes of data per rec
int32_t maxTreeNodes = maxTreeMem /(sizeof(key96_t)+16+1000);
// disk page cache mem, 100MB on gk0 now
//int32_t pcmem = 0; // g_conf.m_cachedbMaxDiskPageCacheMem;
// keep this low if we are the tmp cluster
//if ( g_hostdb.m_useTmpCluster ) pcmem = 0;
// TODO: would be nice to just do page caching on the satellite files;
// look into "minimizeDiskSeeks" at some point...
m_name = "cachedb";
m_rdbId = RDB_CACHEDB;
if ( this == &g_serpdb ) {
m_name = "serpdb";
m_rdbId = RDB_SERPDB;
}
// if ( ! m_pc.init ( m_name ,
// m_rdbId, // RDB_CACHEDB,
// pcmem ,
// pageSize ))
// return log("db: %s init failed.",m_name);
// init the rdb
if ( ! m_rdb.init ( g_hostdb.m_dir ,
m_name ,
true , // dedup
-1 , // fixeddatasize is 0 since no data
4,//g_conf.m_cachedbMinFilesToMerge ,
// fix this to 15 and rely on the page cache of
// just the satellite files and the daily merge to
// keep things fast.
//15 ,
maxTreeMem ,
maxTreeNodes ,
true , //isTreeBalanced
0 , // cache mem
0 , // cache nodes
false, // true , // use half keys
false , // load cache from disk
NULL,//&m_pc ,
false , // false
false , // preload page cache
sizeof(key96_t) ,
true , // bias page cache? (true!)
true )) // is collectionless???? !!!!
return false;
// add the base since it is a collectionless rdb
return m_rdb.addRdbBase1 ( NULL );
}
/*
bool Cachedb::addColl ( char *coll, bool doVerify ) {
if ( ! m_rdb.addColl ( coll ) ) return false;
if ( ! doVerify ) return true;
// verify
if ( verify(coll) ) return true;
// if not allowing scale, return false
if ( ! g_conf.m_allowScale ) return false;
// otherwise let it go
log ( "db: Verify failed, but scaling is allowed, passing." );
return true;
}
*/
// . sanity-check the records stored in this rdb: scan up to ~64KB of
//   records and confirm each key shards to this host's shard
// . coll is expected to be NULL since this rdb is collectionless
// . returns true when consistent; on inconsistency returns
//   g_conf.m_bypassValidation (i.e. false unless validation is bypassed)
bool Cachedb::verify ( char *coll ) {
	log ( LOG_DEBUG, "db: Verifying %s...",m_name );
	g_threads.disableThreads();
	Msg5 msg5;
	Msg5 msg5b;
	RdbList list;
	// NOTE(review): the rdb was initialized with sizeof(key96_t) keys in
	// init(), yet this scan uses key224_t -- confirm which key size
	// cachedb records really use.
	key224_t startKey;
	key224_t endKey;
	startKey.setMin();
	endKey.setMax();
	int32_t minRecSizes = 64000;
	CollectionRec *cr = g_collectiondb.getRec(coll);
	// coll is probably NULL here; getRec() may return NULL, and the old
	// code dereferenced it unconditionally. Bail gracefully instead of
	// crashing, restoring threads first. log() returns false.
	if ( ! cr ) {
		g_threads.enableThreads();
		return log("db: %s verify: no collection record.",m_name);
	}
	// Synchronously read an initial chunk of records (tree + files).
	if ( ! msg5.getList ( m_rdbId,//RDB_CACHEDB ,
	                      cr->m_collnum ,
	                      &list ,
	                      (char*)&startKey ,
	                      (char*)&endKey ,
	                      minRecSizes ,
	                      true ,   // includeTree ,
	                      false ,  // add to cache?
	                      0 ,      // max cache age
	                      0 ,      // startFileNum ,
	                      -1 ,     // numFiles ,
	                      NULL ,   // state
	                      NULL ,   // callback
	                      0 ,      // niceness
	                      false ,  // err correction?
	                      NULL ,
	                      0 ,
	                      -1 ,
	                      true ,
	                      -1LL ,
	                      &msg5b ,
	                      true )) {
		// with no callback/state this call must complete synchronously
		g_threads.enableThreads();
		return log("db: HEY! it did not block");
	}
	// Count how many of the returned records actually belong to our shard.
	int32_t count = 0;
	int32_t got = 0;
	for ( list.resetListPtr() ; ! list.isExhausted() ;
	      list.skipCurrentRecord() ) {
		key224_t k;
		list.getCurrentKey((char*)&k);
		count++;
		uint32_t shardNum = getShardNum ( m_rdbId , &k );//RDB_CACHEDB
		if ( shardNum == getMyShardNum() ) got++;
	}
	if ( got != count ) {
		log ("db: Out of first %"INT32" records in %s , "
		     "only %"INT32" belong to our group.",count,m_name,got);
		// none at all usually means the data dir holds another host's data
		if ( got == 0 ) log("db: Are you sure you have the "
		                    "right "
		                    "data in the right directory? "
		                    "Exiting.");
		log ( "db: Exiting due to inconsistency.");
		g_threads.enableThreads();
		// only pass if the admin explicitly bypassed validation
		return g_conf.m_bypassValidation;
	}
	log ( LOG_DEBUG, "db: %s passed verification successfully for "
	      "%"INT32" recs.", m_name,count );
	// DONE
	g_threads.enableThreads();
	return true;
}