Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Changes In Branch compression-hooks Excluding Merge-Ins
This is equivalent to a diff from a7de625f13 to 676da8516d
2012-10-28
| ||
11:38 | Merge compression-hooks branch with trunk. check-in: a701b281e9 user: dan tags: trunk | |
11:34 | Turn off LSM_CONFIG_MMAP automatically in compressed database mode. Leaf check-in: 676da8516d user: dan tags: compression-hooks | |
11:28 | Enhance the file-format to allow padding records smaller than 6 bytes in length. check-in: 2ba0368e76 user: dan tags: compression-hooks | |
2012-10-17
| ||
19:29 | Fix various issues with tcl test cases. check-in: ae7d9c68ef user: dan tags: trunk | |
2012-10-16
| ||
15:26 | Change page numbers to 8-byte numbers (from 4). This is required to support compressed databases, where a page number is a byte offset in the database file. check-in: 5d266a717d user: dan tags: compression-hooks | |
2012-10-15
| ||
19:36 | Merge range-delete branch back into trunk. check-in: a7de625f13 user: dan tags: trunk | |
19:34 | Fix a case in live-recovery from a writer crash. Leaf check-in: 80abdbea2d user: dan tags: range-delete | |
2012-10-03
| ||
09:24 | Minor changes to the lsmperf.tcl script. check-in: 45e59053e7 user: dan tags: trunk | |
Changes to lsm-test/lsmtest.h.
︙ | ︙ | |||
74 75 76 77 78 79 80 81 82 83 84 85 86 87 | ** Functions in wrapper3.c. This file contains the tdb wrapper for lsm. ** The wrapper for lsm is a bit more involved than the others, as it ** includes code for a couple of different lsm configurations, and for ** various types of fault injection and robustness testing. */ int test_lsm_open(const char *zFilename, int bClear, TestDb **ppDb); int test_lsm_lomem_open(const char *zFilename, int bClear, TestDb **ppDb); int test_lsm_small_open(const char *zFilename, int bClear, TestDb **ppDb); int test_lsm_mt2(const char *zFilename, int bClear, TestDb **ppDb); int test_lsm_mt3(const char *zFilename, int bClear, TestDb **ppDb); /* Functions in testutil.c. */ int testPrngInit(void); u32 testPrngValue(u32 iVal); | > | 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 | ** Functions in wrapper3.c. This file contains the tdb wrapper for lsm. ** The wrapper for lsm is a bit more involved than the others, as it ** includes code for a couple of different lsm configurations, and for ** various types of fault injection and robustness testing. */ int test_lsm_open(const char *zFilename, int bClear, TestDb **ppDb); int test_lsm_lomem_open(const char *zFilename, int bClear, TestDb **ppDb); int test_lsm_zip_open(const char *zFilename, int bClear, TestDb **ppDb); int test_lsm_small_open(const char *zFilename, int bClear, TestDb **ppDb); int test_lsm_mt2(const char *zFilename, int bClear, TestDb **ppDb); int test_lsm_mt3(const char *zFilename, int bClear, TestDb **ppDb); /* Functions in testutil.c. */ int testPrngInit(void); u32 testPrngValue(u32 iVal); |
︙ | ︙ |
Changes to lsm-test/lsmtest2.c.
︙ | ︙ | |||
200 201 202 203 204 205 206 | ** This function is a no-op if *pRc is non-zero when it is called. ** ** Open the LSM database identified by zFile and compute its checksum ** (a string, as returned by testCksumDatabase()). If the checksum is ** identical to zExpect1 or, if it is not NULL, zExpect2, the test passes. ** Otherwise, print an error message and set *pRc to 1. */ | | > | | 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 | ** This function is a no-op if *pRc is non-zero when it is called. ** ** Open the LSM database identified by zFile and compute its checksum ** (a string, as returned by testCksumDatabase()). If the checksum is ** identical to zExpect1 or, if it is not NULL, zExpect2, the test passes. ** Otherwise, print an error message and set *pRc to 1. */ static void testCompareCksumLsmdb( const char *zFile, /* Path to LSM database */ int bCompress, /* True if db is compressed */ const char *zExpect1, /* Expected checksum 1 */ const char *zExpect2, /* Expected checksum 2 (or NULL) */ int *pRc /* IN/OUT: Test case error code */ ){ if( *pRc==0 ){ char zCksum[TEST_CKSUM_BYTES]; TestDb *pDb; *pRc = tdb_lsm_open((bCompress?"compression=1 mmap=0":""), zFile, 0, &pDb); testCksumDatabase(pDb, zCksum); testClose(&pDb); if( *pRc==0 ){ int r1 = 0; int r2 = -1; |
︙ | ︙ | |||
245 246 247 248 249 250 251 | ** should really be in this file. *************************************************************************/ /* ** This test verifies that if a system crash occurs while doing merge work ** on the db, no data is lost. */ | | < > > > > > > > > | | | | | | > | | 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 | ** should really be in this file. *************************************************************************/ /* ** This test verifies that if a system crash occurs while doing merge work ** on the db, no data is lost. */ static void crash_test1(int bCompress, int *pRc){ const char *DBNAME = "testdb.lsm"; const DatasourceDefn defn = {TEST_DATASOURCE_RANDOM, 12, 16, 200, 200}; const int nRow = 5000; /* Database size */ const int nIter = 200; /* Number of test iterations */ const int nWork = 20; /* Maximum lsm_work() calls per iteration */ const int nPage = 15; /* Pages per lsm_work call */ int i; int iDot = 0; Datasource *pData; CksumDb *pCksumDb; TestDb *pDb; char *zCfg; const char *azConfig[2] = { "page_size=1024 block_size=65536 write_buffer=16384 safety=2 mmap=0", "page_size=1024 block_size=65536 write_buffer=16384 safety=2 " " compression=1 mmap=0" }; assert( bCompress==0 || bCompress==1 ); /* Allocate datasource. And calculate the expected checksums. */ pData = testDatasourceNew(&defn); pCksumDb = testCksumArrayNew(pData, nRow, nRow, 1); /* Setup and save the initial database. */ zCfg = testMallocPrintf("%s nmerge=7", azConfig[bCompress]); testSetupSavedLsmdb(zCfg, DBNAME, pData, 5000, pRc); testFree(zCfg); for(i=0; i<nIter && *pRc==0; i++){ int iWork; int testrc = 0; testCaseProgress(i, nIter, testCaseNDot(), &iDot); /* Restore and open the database. */ testRestoreLsmdb(DBNAME); testrc = tdb_lsm_open(azConfig[bCompress], DBNAME, 0, &pDb); assert( testrc==0 ); /* Call lsm_work() on the db */ tdb_lsm_prepare_sync_crash(pDb, 1 + (i%(nWork*2))); for(iWork=0; testrc==0 && iWork<nWork; iWork++){ int nWrite = 0; lsm_db *db = tdb_lsm(pDb); testrc = lsm_work(db, 0, nPage, &nWrite); assert( testrc!=0 || nWrite>0 ); if( testrc==0 ) testrc = lsm_checkpoint(db, 0); } tdb_close(pDb); /* Check that the database content is still correct */ testCompareCksumLsmdb(DBNAME, bCompress, testCksumArrayGet(pCksumDb, nRow), 0, pRc); } testCksumArrayFree(pCksumDb); testDatasourceFree(pData); } /* ** This test verifies that if a system crash occurs while committing a ** transaction to the log file, no earlier transactions are lost or damaged. */ static void crash_test2(int bCompress, int *pRc){ const char *DBNAME = "testdb.lsm"; const DatasourceDefn defn = {TEST_DATASOURCE_RANDOM, 12, 16, 1000, 1000}; const int nIter = 200; const int nInsert = 20; int i; |
︙ | ︙ | |||
349 350 351 352 353 354 355 | testDatasourceEntry(pData, 100+iIns, &pKey, &nKey, &pVal, &nVal); testrc = tdb_write(pDb, pKey, nKey, pVal, nVal); if( testrc ) break; } tdb_close(pDb); /* Check that no data was lost when the system crashed. */ | | | | 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 | testDatasourceEntry(pData, 100+iIns, &pKey, &nKey, &pVal, &nVal); testrc = tdb_write(pDb, pKey, nKey, pVal, nVal); if( testrc ) break; } tdb_close(pDb); /* Check that no data was lost when the system crashed. */ testCompareCksumLsmdb(DBNAME, bCompress, testCksumArrayGet(pCksumDb, 100 + iIns), testCksumArrayGet(pCksumDb, 100 + iIns + 1), pRc ); } testDatasourceFree(pData); testCksumArrayFree(pCksumDb); } /* ** This test verifies that if a system crash occurs when checkpointing ** the database, data is not lost (assuming that any writes not synced ** to the db have been synced into the log file). */ static void crash_test3(int bCompress, int *pRc){ const char *DBNAME = "testdb.lsm"; const int nIter = 100; const DatasourceDefn defn = {TEST_DATASOURCE_RANDOM, 12, 16, 1000, 1000}; int i; int iDot = 0; Datasource *pData; |
︙ | ︙ | |||
399 400 401 402 403 404 405 | /* Schedule a crash simulation then close the db. */ tdb_lsm_prepare_sync_crash(pDb, 1 + (i%2)); tdb_close(pDb); /* Open the database and check that the crash did not cause any ** data loss. */ | | | > | > | | | | 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 | /* Schedule a crash simulation then close the db. */ tdb_lsm_prepare_sync_crash(pDb, 1 + (i%2)); tdb_close(pDb); /* Open the database and check that the crash did not cause any ** data loss. */ testCompareCksumLsmdb(DBNAME, bCompress, testCksumArrayGet(pCksumDb, 110 + iOpen*10), 0, pRc ); } } testDatasourceFree(pData); testCksumArrayFree(pCksumDb); } void do_crash_test(const char *zPattern, int *pRc){ struct Test { const char *zTest; void (*x)(int, int *); int bCompress; } aTest [] = { { "crash.lsm.1", crash_test1, 0 }, { "crash.lsm_zip.1", crash_test1, 1 }, { "crash.lsm.2", crash_test2, 0 }, { "crash.lsm.3", crash_test3, 0 }, }; int i; for(i=0; *pRc==LSM_OK && i<ArraySize(aTest); i++){ struct Test *p = &aTest[i]; if( testCaseBegin(pRc, zPattern, "%s", p->zTest) ){ p->x(p->bCompress, pRc); testCaseFinish(*pRc); } } } |
Changes to lsm-test/lsmtest_main.c.
︙ | ︙ | |||
1329 1330 1331 1332 1333 1334 1335 | if( pClose ) fclose(pClose); pEnv->xClose(pOut); return rc; } static int do_insert(int nArg, char **azArg){ | < | | < > > > > | > > < < < | 1329 1330 1331 1332 1333 1334 1335 1336 1337 1338 1339 1340 1341 1342 1343 1344 1345 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 | if( pClose ) fclose(pClose); pEnv->xClose(pOut); return rc; } static int do_insert(int nArg, char **azArg){ const char *zDb = "lsm"; TestDb *pDb = 0; int i; int rc; const int nRow = 1 * 1000 * 1000; DatasourceDefn defn = { TEST_DATASOURCE_RANDOM, 8, 15, 80, 150 }; Datasource *pData = 0; if( nArg>1 ){ testPrintError("Usage: insert ?DATABASE?\n"); return 1; } if( nArg==1 ){ zDb = azArg[0]; } testMallocUninstall(tdb_lsm_env()); for(i=0; zDb[i] && zDb[i]!='='; i++); if( zDb[i] ){ rc = tdb_lsm_open(zDb, "testdb.lsm", 1, &pDb); }else{ rc = tdb_open(zDb, 0, 1, &pDb); } if( rc!=0 ){ testPrintError("Error opening db \"%s\": %d\n", zDb, rc); }else{ InsertWriteHook hook; memset(&hook, 0, sizeof(hook)); hook.pOut = fopen("writelog.txt", "w"); pData = testDatasourceNew(&defn); tdb_lsm_config_work_hook(pDb, do_insert_work_hook, 0); tdb_lsm_write_hook(pDb, do_insert_write_hook, (void *)&hook); if( rc==0 ){ for(i=0; i<nRow; i++){ void *pKey; int nKey; /* Database key to insert */ void *pVal; int nVal; /* Database value to insert */ testDatasourceEntry(pData, i, &pKey, &nKey, &pVal, &nVal); tdb_write(pDb, pKey, nKey, pVal, nVal); |
︙ | ︙ |
Changes to lsm-test/lsmtest_tdb.c.
︙ | ︙ | |||
609 610 611 612 613 614 615 616 617 618 619 620 621 622 | const char *zName; const char *zDefaultDb; int (*xOpen)(const char *zFilename, int bClear, TestDb **ppDb); } aLib[] = { { "sqlite3", "testdb.sqlite", sql_open }, { "lsm_small", "testdb.lsm_small", test_lsm_small_open }, { "lsm_lomem", "testdb.lsm_lomem", test_lsm_lomem_open }, { "lsm", "testdb.lsm", test_lsm_open }, #ifdef LSM_MUTEX_PTHREADS { "lsm_mt2", "testdb.lsm_mt2", test_lsm_mt2 }, { "lsm_mt3", "testdb.lsm_mt3", test_lsm_mt3 }, #endif #ifdef HAVE_LEVELDB { "leveldb", "testdb.leveldb", test_leveldb_open }, | > > > | 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 | const char *zName; const char *zDefaultDb; int (*xOpen)(const char *zFilename, int bClear, TestDb **ppDb); } aLib[] = { { "sqlite3", "testdb.sqlite", sql_open }, { "lsm_small", "testdb.lsm_small", test_lsm_small_open }, { "lsm_lomem", "testdb.lsm_lomem", test_lsm_lomem_open }, #ifdef HAVE_ZLIB { "lsm_zip", "testdb.lsm_zip", test_lsm_zip_open }, #endif { "lsm", "testdb.lsm", test_lsm_open }, #ifdef LSM_MUTEX_PTHREADS { "lsm_mt2", "testdb.lsm_mt2", test_lsm_mt2 }, { "lsm_mt3", "testdb.lsm_mt3", test_lsm_mt3 }, #endif #ifdef HAVE_LEVELDB { "leveldb", "testdb.leveldb", test_leveldb_open }, |
︙ | ︙ |
Changes to lsm-test/lsmtest_tdb3.c.
︙ | ︙ | |||
366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 | int iOpt = testPrngValue(iSeed++) % 3; switch( iOpt ){ case 0: break; case 1: testPrngArray(iSeed++, (u32 *)aOld, pDb->szSector/4); case 2: pEnv->xWrite( pFile, (lsm_i64)i * pDb->szSector, aOld, pDb->szSector ); break; } testFree(aOld); pDb->aFile[iFile].aSector[i].aOld = 0; } } pEnv->xClose(pFile); zFree = zFile = sqlite3_mprintf("%s-log", pDb->zName); } sqlite3_free(zFree); } /* ** End test VFS code. ************************************************************************** *************************************************************************/ static int test_lsm_close(TestDb *pTestDb){ int i; int rc = LSM_OK; LsmDb *pDb = (LsmDb *)pTestDb; | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 410 411 412 413 414 415 416 417 418 419 420 421 422 423 424 425 426 427 428 429 430 431 432 433 434 435 436 437 438 439 440 441 442 443 444 445 446 447 448 449 450 451 452 453 454 | int iOpt = testPrngValue(iSeed++) % 3; switch( iOpt ){ case 0: break; case 1: testPrngArray(iSeed++, (u32 *)aOld, pDb->szSector/4); /* Fall-through */ case 2: pEnv->xWrite( pFile, (lsm_i64)i * pDb->szSector, aOld, pDb->szSector ); break; } testFree(aOld); pDb->aFile[iFile].aSector[i].aOld = 0; } } pEnv->xClose(pFile); zFree = zFile = sqlite3_mprintf("%s-log", pDb->zName); } sqlite3_free(zFree); } /* ** End test VFS code. ************************************************************************** *************************************************************************/ /************************************************************************* ************************************************************************** ** Begin test compression hooks. */ #ifdef HAVE_ZLIB #include <zlib.h> static int testZipBound(void *pCtx, int nSrc){ return compressBound(nSrc); } static int testZipCompress( void *pCtx, /* Context pointer */ char *aOut, int *pnOut, /* OUT: Buffer containing compressed data */ const char *aIn, int nIn /* Buffer containing input data */ ){ uLongf n = *pnOut; /* In/out buffer size for compress() */ int rc; /* compress() return code */ rc = compress((Bytef*)aOut, &n, (Bytef*)aIn, nIn); *pnOut = n; return (rc==Z_OK ? 0 : LSM_ERROR); } static int testZipUncompress( void *pCtx, /* Context pointer */ char *aOut, int *pnOut, /* OUT: Buffer containing uncompressed data */ const char *aIn, int nIn /* Buffer containing input data */ ){ uLongf n = *pnOut; /* In/out buffer size for uncompress() */ int rc; /* uncompress() return code */ rc = uncompress((Bytef*)aOut, &n, (Bytef*)aIn, nIn); *pnOut = n; return (rc==Z_OK ? 0 : LSM_ERROR); } static int testConfigureCompression(lsm_db *pDb){ static lsm_compress zip = { 1, sizeof(lsm_compress), 0, /* Context pointer (unused) */ testZipBound, /* xBound method */ testZipCompress, /* xCompress method */ testZipUncompress /* xUncompress method */ }; return lsm_config(pDb, LSM_CONFIG_SET_COMPRESSION, &zip); } #endif /* ifdef HAVE_ZLIB */ /* ** End test compression hooks. ************************************************************************** *************************************************************************/ static int test_lsm_close(TestDb *pTestDb){ int i; int rc = LSM_OK; LsmDb *pDb = (LsmDb *)pTestDb; |
︙ | ︙ | |||
613 614 615 616 617 618 619 620 621 622 623 624 625 626 | static void xWorkHook(lsm_db *db, void *pArg){ LsmDb *p = (LsmDb *)pArg; if( p->xWork ) p->xWork(db, p->pWorkCtx); } #define TEST_NO_RECOVERY -1 #define TEST_THREADS -2 static int test_lsm_config_str( LsmDb *pLsm, lsm_db *db, int bWorker, const char *zStr, int *pnThread | > | 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 | static void xWorkHook(lsm_db *db, void *pArg){ LsmDb *p = (LsmDb *)pArg; if( p->xWork ) p->xWork(db, p->pWorkCtx); } #define TEST_NO_RECOVERY -1 #define TEST_THREADS -2 #define TEST_COMPRESSION -3 static int test_lsm_config_str( LsmDb *pLsm, lsm_db *db, int bWorker, const char *zStr, int *pnThread |
︙ | ︙ | |||
641 642 643 644 645 646 647 648 649 650 651 652 653 654 | { "use_log", 0, LSM_CONFIG_USE_LOG }, { "nmerge", 0, LSM_CONFIG_NMERGE }, { "max_freelist", 0, LSM_CONFIG_MAX_FREELIST }, { "multi_proc", 0, LSM_CONFIG_MULTIPLE_PROCESSES }, { "worker_nmerge", 1, LSM_CONFIG_NMERGE }, { "test_no_recovery", 0, TEST_NO_RECOVERY }, { "threads", 0, TEST_THREADS }, { 0, 0 } }; const char *z = zStr; int nThread = 1; assert( db ); while( z[0] ){ | > > > | 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 | { "use_log", 0, LSM_CONFIG_USE_LOG }, { "nmerge", 0, LSM_CONFIG_NMERGE }, { "max_freelist", 0, LSM_CONFIG_MAX_FREELIST }, { "multi_proc", 0, LSM_CONFIG_MULTIPLE_PROCESSES }, { "worker_nmerge", 1, LSM_CONFIG_NMERGE }, { "test_no_recovery", 0, TEST_NO_RECOVERY }, { "threads", 0, TEST_THREADS }, #ifdef HAVE_ZLIB { "compression", 0, TEST_COMPRESSION }, #endif { 0, 0 } }; const char *z = zStr; int nThread = 1; assert( db ); while( z[0] ){ |
︙ | ︙ | |||
692 693 694 695 696 697 698 699 700 701 702 703 704 705 | switch( eParam ){ case TEST_NO_RECOVERY: pLsm->bNoRecovery = iVal; break; case TEST_THREADS: nThread = iVal; break; } } } }else if( z!=zStart ){ goto syntax_error; } } | > > > > > | 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 | switch( eParam ){ case TEST_NO_RECOVERY: pLsm->bNoRecovery = iVal; break; case TEST_THREADS: nThread = iVal; break; #ifdef HAVE_ZLIB case TEST_COMPRESSION: testConfigureCompression(db); break; #endif } } } }else if( z!=zStart ){ goto syntax_error; } } |
︙ | ︙ | |||
828 829 830 831 832 833 834 | int test_lsm_lomem_open( const char *zFilename, int bClear, TestDb **ppDb ){ const char *zCfg = | | > > > > > > > > > > > > > > > > | 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 | int test_lsm_lomem_open( const char *zFilename, int bClear, TestDb **ppDb ){ const char *zCfg = "page_size=256 block_size=65536 write_buffer=16384 " "max_freelist=4 autocheckpoint=32768 " "mmap=0 " ; return testLsmOpen(zCfg, zFilename, bClear, ppDb); } int test_lsm_zip_open( const char *zFilename, int bClear, TestDb **ppDb ){ const char *zCfg = "page_size=256 block_size=65536 write_buffer=16384 " "max_freelist=4 autocheckpoint=32768 compression=1" "mmap=0 " ; return testLsmOpen(zCfg, zFilename, bClear, ppDb); } lsm_db *tdb_lsm(TestDb *pDb){ if( pDb->pMethods->xClose==test_lsm_close ){ return ((LsmDb *)pDb)->db; } |
︙ | ︙ |
Changes to src/kvlsm.c.
︙ | ︙ | |||
451 452 453 454 455 456 457 458 459 460 461 462 463 464 | memset(pNew, 0, sizeof(KVLsm)); pNew->base.pStoreVfunc = &kvlsmMethods; pNew->base.pEnv = pEnv; rc = lsm_new(0, &pNew->pDb); if( rc==SQLITE4_OK ){ int i; for(i=0; i<ArraySize(aConfig); i++){ const char *zVal = sqlite4_uri_parameter(zName, aConfig[i].zParam); if( zVal ){ int nVal = sqlite4Atoi(zVal); lsm_config(pNew->pDb, aConfig[i].eParam, &nVal); } } | > > | 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 | memset(pNew, 0, sizeof(KVLsm)); pNew->base.pStoreVfunc = &kvlsmMethods; pNew->base.pEnv = pEnv; rc = lsm_new(0, &pNew->pDb); if( rc==SQLITE4_OK ){ int i; int bMmap = 0; lsm_config(pNew->pDb, LSM_CONFIG_MMAP, &bMmap); for(i=0; i<ArraySize(aConfig); i++){ const char *zVal = sqlite4_uri_parameter(zName, aConfig[i].zParam); if( zVal ){ int nVal = sqlite4Atoi(zVal); lsm_config(pNew->pDb, aConfig[i].eParam, &nVal); } } |
︙ | ︙ |
Changes to src/lsm.h.
︙ | ︙ | |||
29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 | typedef struct lsm_file lsm_file; /* OS file handle */ /* 64-bit integer type used for file offsets. */ typedef long long int lsm_i64; /* 64-bit signed integer type */ /* Forward reference */ typedef struct lsm_env lsm_env; /* Runtime environment */ /* Candidate values for the 3rd argument to lsm_env.xLock() */ #define LSM_LOCK_UNLOCK 0 #define LSM_LOCK_SHARED 1 #define LSM_LOCK_EXCL 2 /* ** Run-time environment used by LSM */ struct lsm_env { int nByte; /* Size of this structure in bytes */ | > | | 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 | typedef struct lsm_file lsm_file; /* OS file handle */ /* 64-bit integer type used for file offsets. */ typedef long long int lsm_i64; /* 64-bit signed integer type */ /* Forward reference */ typedef struct lsm_env lsm_env; /* Runtime environment */ typedef struct lsm_compress lsm_compress; /* Compression library functions */ /* Candidate values for the 3rd argument to lsm_env.xLock() */ #define LSM_LOCK_UNLOCK 0 #define LSM_LOCK_SHARED 1 #define LSM_LOCK_EXCL 2 /* ** Run-time environment used by LSM */ struct lsm_env { int nByte; /* Size of this structure in bytes */ int iVersion; /* Version number of this structure (1) */ /****** file i/o ***********************************************/ void *pVfsCtx; int (*xFullpath)(lsm_env*, const char *, char *, int *); int (*xOpen)(lsm_env*, const char *, lsm_file **); int (*xRead)(lsm_file *, lsm_i64, void *, int); int (*xWrite)(lsm_file *, lsm_i64, void *, int); int (*xTruncate)(lsm_file *, lsm_i64); |
︙ | ︙ | |||
76 77 78 79 80 81 82 83 84 85 86 87 88 89 | void (*xMutexEnter)(lsm_mutex *); /* Grab a mutex */ int (*xMutexTry)(lsm_mutex *); /* Attempt to obtain a mutex */ void (*xMutexLeave)(lsm_mutex *); /* Leave a mutex */ int (*xMutexHeld)(lsm_mutex *); /* Return true if mutex is held */ int (*xMutexNotHeld)(lsm_mutex *); /* Return true if mutex not held */ /****** other ****************************************************/ int (*xSleep)(lsm_env*, int microseconds); /* New fields may be added in future releases, in which case the ** iVersion value will increase. */ }; /* ** Values that may be passed as the second argument to xMutexStatic. | > > > > > > > > > > > > > > > > > | 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 | void (*xMutexEnter)(lsm_mutex *); /* Grab a mutex */ int (*xMutexTry)(lsm_mutex *); /* Attempt to obtain a mutex */ void (*xMutexLeave)(lsm_mutex *); /* Leave a mutex */ int (*xMutexHeld)(lsm_mutex *); /* Return true if mutex is held */ int (*xMutexNotHeld)(lsm_mutex *); /* Return true if mutex not held */ /****** other ****************************************************/ int (*xSleep)(lsm_env*, int microseconds); /* New fields may be added in future releases, in which case the ** iVersion value will increase. */ }; /* ** The compression library interface. */ struct lsm_compress { int nByte; /* Size of this structure in bytes */ int iVersion; /* Version number of this structure (1) */ /* Compression library functions */ void *pCtx; int (*xBound)(void *, int nSrc); int (*xCompress)(void *, char *, int *, const char *, int); int (*xUncompress)(void *, char *, int *, const char *, int); /* New fields may be added in future releases, in which case the ** iVersion value will increase. */ }; /* ** Values that may be passed as the second argument to xMutexStatic. |
︙ | ︙ | |||
188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 209 210 211 212 213 214 | ** stored elsewhere in the database). ** ** There is no reason for an application to configure or query this ** parameter. It is only present because configuring a small value ** makes certain parts of the lsm code easier to test. ** ** LSM_CONFIG_MULTIPLE_PROCESSES */ #define LSM_CONFIG_WRITE_BUFFER 1 #define LSM_CONFIG_PAGE_SIZE 2 #define LSM_CONFIG_SAFETY 3 #define LSM_CONFIG_BLOCK_SIZE 4 #define LSM_CONFIG_AUTOWORK 5 #define LSM_CONFIG_LOG_SIZE 6 #define LSM_CONFIG_MMAP 7 #define LSM_CONFIG_USE_LOG 8 #define LSM_CONFIG_NMERGE 9 #define LSM_CONFIG_MAX_FREELIST 10 #define LSM_CONFIG_MULTIPLE_PROCESSES 11 #define LSM_CONFIG_AUTOCHECKPOINT 12 #define LSM_SAFETY_OFF 0 #define LSM_SAFETY_NORMAL 1 #define LSM_SAFETY_FULL 2 /* | > > > > > > > > > > > > > > > > > > > > > | 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 | ** stored elsewhere in the database). ** ** There is no reason for an application to configure or query this ** parameter. It is only present because configuring a small value ** makes certain parts of the lsm code easier to test. ** ** LSM_CONFIG_MULTIPLE_PROCESSES ** A read/write boolean parameter. This parameter may only be set before ** lsm_open() has been called. If true, the library uses shared-memory ** and posix advisory locks to co-ordinate access by clients from within ** multiple processes. Otherwise, if false, all database clients must be ** located in the same process. The default value is true. ** ** LSM_CONFIG_SET_COMPRESSION ** Set the compression methods used to compress and decompress database ** content. The argument to this option should be a pointer to a structure ** of type lsm_compress. The lsm_config() method takes a copy of the ** structures contents. ** ** This option may only be used before lsm_open() is called. Invoking it ** after lsm_open() has been called results in an LSM_MISUSE error. ** ** LSM_CONFIG_GET_COMPRESSION ** Query the compression methods used to compress and decompress database ** content. */ #define LSM_CONFIG_WRITE_BUFFER 1 #define LSM_CONFIG_PAGE_SIZE 2 #define LSM_CONFIG_SAFETY 3 #define LSM_CONFIG_BLOCK_SIZE 4 #define LSM_CONFIG_AUTOWORK 5 #define LSM_CONFIG_LOG_SIZE 6 #define LSM_CONFIG_MMAP 7 #define LSM_CONFIG_USE_LOG 8 #define LSM_CONFIG_NMERGE 9 #define LSM_CONFIG_MAX_FREELIST 10 #define LSM_CONFIG_MULTIPLE_PROCESSES 11 #define LSM_CONFIG_AUTOCHECKPOINT 12 #define LSM_CONFIG_SET_COMPRESSION 13 #define LSM_CONFIG_GET_COMPRESSION 14 #define LSM_SAFETY_OFF 0 #define LSM_SAFETY_NORMAL 1 #define LSM_SAFETY_FULL 2 /* |
︙ | ︙ |
Changes to src/lsmInt.h.
︙ | ︙ | |||
99 100 101 102 103 104 105 | typedef unsigned char u8; typedef unsigned short int u16; typedef unsigned int u32; typedef lsm_i64 i64; typedef unsigned long long int u64; | | | | 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 | typedef unsigned char u8; typedef unsigned short int u16; typedef unsigned int u32; typedef lsm_i64 i64; typedef unsigned long long int u64; /* A page number is a 64-bit integer. */ typedef i64 Pgno; #ifdef LSM_DEBUG int lsmErrorBkpt(int); #else # define lsmErrorBkpt(x) (x) #endif |
︙ | ︙ | |||
139 140 141 142 143 144 145 | #define LSM_LOCK_READER(i) ((i) + LSM_LOCK_CHECKPOINTER + 1) /* ** Hard limit on the number of free-list entries that may be stored in ** a checkpoint (the remainder are stored as a system record in the LSM). ** See also LSM_CONFIG_MAX_FREELIST. */ | | | 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 | #define LSM_LOCK_READER(i) ((i) + LSM_LOCK_CHECKPOINTER + 1) /* ** Hard limit on the number of free-list entries that may be stored in ** a checkpoint (the remainder are stored as a system record in the LSM). ** See also LSM_CONFIG_MAX_FREELIST. */ #define LSM_MAX_FREELIST_ENTRIES 24 #define LSM_ATTEMPTS_BEFORE_PROTOCOL 10000 /* ** Each entry stored in the LSM (or in-memory tree structure) has an ** associated mask of the following flags. |
︙ | ︙ | |||
300 301 302 303 304 305 306 307 308 309 310 311 312 313 | int bUseLog; /* Configured by LSM_CONFIG_USE_LOG */ int nDfltPgsz; /* Configured by LSM_CONFIG_PAGE_SIZE */ int nDfltBlksz; /* Configured by LSM_CONFIG_BLOCK_SIZE */ int nMaxFreelist; /* Configured by LSM_CONFIG_MAX_FREELIST */ int bMmap; /* Configured by LSM_CONFIG_MMAP */ int nAutockpt; /* Configured by LSM_CONFIG_AUTOCHECKPOINT */ int bMultiProc; /* Configured by L_C_MULTIPLE_PROCESSES */ /* Sub-system handles */ FileSystem *pFS; /* On-disk portion of database */ Database *pDatabase; /* Database shared data */ /* Client transaction context */ Snapshot *pClient; /* Client snapshot (non-NULL in read trans) */ | > | 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 | int bUseLog; /* Configured by LSM_CONFIG_USE_LOG */ int nDfltPgsz; /* Configured by LSM_CONFIG_PAGE_SIZE */ int nDfltBlksz; /* Configured by LSM_CONFIG_BLOCK_SIZE */ int nMaxFreelist; /* Configured by LSM_CONFIG_MAX_FREELIST */ int bMmap; /* Configured by LSM_CONFIG_MMAP */ int nAutockpt; /* Configured by LSM_CONFIG_AUTOCHECKPOINT */ int bMultiProc; /* Configured by L_C_MULTIPLE_PROCESSES */ lsm_compress compress; /* Compression callbacks */ /* Sub-system handles */ FileSystem *pFS; /* On-disk portion of database */ Database *pDatabase; /* Database shared data */ /* Client transaction context */ Snapshot *pClient; /* Client snapshot (non-NULL in read trans) */ |
︙ | ︙ | |||
338 339 340 341 342 343 344 | ShmHeader *pShmhdr; /* Live shared-memory header */ TreeHeader treehdr; /* Local copy of tree-header */ u32 aSnapshot[LSM_META_PAGE_SIZE / sizeof(u32)]; }; struct Segment { Pgno iFirst; /* First page of this run */ | | | | | 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 | ShmHeader *pShmhdr; /* Live shared-memory header */ TreeHeader treehdr; /* Local copy of tree-header */ u32 aSnapshot[LSM_META_PAGE_SIZE / sizeof(u32)]; }; struct Segment { Pgno iFirst; /* First page of this run */ Pgno iLastPg; /* Last page of this run */ Pgno iRoot; /* Root page number (if any) */ int nSize; /* Size of this run in pages */ }; /* ** iSplitTopic/pSplitKey/nSplitKey: ** If nRight>0, this buffer contains a copy of the largest key that has ** already been written to the left-hand-side of the level. */ |
︙ | ︙ | |||
369 370 371 372 373 374 375 | ** A structure describing an ongoing merge. There is an instance of this ** structure for every Level currently undergoing a merge in the worker ** snapshot. ** ** It is assumed that code that uses an instance of this structure has ** access to the associated Level struct. ** | < < < < | 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 | ** A structure describing an ongoing merge. There is an instance of this ** structure for every Level currently undergoing a merge in the worker ** snapshot. ** ** It is assumed that code that uses an instance of this structure has ** access to the associated Level struct. ** ** iOutputOff: ** The byte offset to write to next within the last page of the ** output segment. */ struct MergeInput { Pgno iPg; /* Page on which next input is stored */ int iCell; /* Cell containing next input to merge */ }; struct Merge { int nInput; /* Number of input runs being merged */ MergeInput *aInput; /* Array nInput entries in size */ MergeInput splitkey; /* Location in file of current splitkey */ int nSkip; /* Number of separators entries to skip */ int iOutputOff; /* Write offset on output page */ Pgno iCurrentPtr; /* Current pointer value */ }; /* ** The first argument to this macro is a pointer to a Segment structure. ** Returns true if the structure instance indicates that the separators ** array is valid. */ |
︙ | ︙ | |||
481 482 483 484 485 486 487 | Database *pDatabase; /* Database this snapshot belongs to */ Level *pLevel; /* Pointer to level 0 of snapshot (or NULL) */ i64 iId; /* Snapshot id */ i64 iLogOff; /* Log file offset */ /* Used by worker snapshots only */ int nBlock; /* Number of blocks in database file */ | | | 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 | Database *pDatabase; /* Database this snapshot belongs to */ Level *pLevel; /* Pointer to level 0 of snapshot (or NULL) */ i64 iId; /* Snapshot id */ i64 iLogOff; /* Log file offset */ /* Used by worker snapshots only */ int nBlock; /* Number of blocks in database file */ Pgno aiAppend[LSM_APPLIST_SZ]; /* Append point list */ Freelist freelist; /* Free block list */ int nFreelistOvfl; /* Number of extra free-list entries in LSM */ u32 nWrite; /* Total number of pages written to disk */ }; #define LSM_INITIAL_SNAPSHOT_ID 11 /* |
︙ | ︙ | |||
618 619 620 621 622 623 624 | int lsmFsFileid(lsm_db *pDb, void **ppId, int *pnId); /* Creating, populating, gobbling and deleting sorted runs. */ void lsmFsGobble(lsm_db *, Segment *, Pgno *, int); int lsmFsSortedDelete(FileSystem *, Snapshot *, int, Segment *); int lsmFsSortedFinish(FileSystem *, Segment *); int lsmFsSortedAppend(FileSystem *, Snapshot *, Segment *, Page **); | | > < > | 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 | int lsmFsFileid(lsm_db *pDb, void **ppId, int *pnId); /* Creating, populating, gobbling and deleting sorted runs. */ void lsmFsGobble(lsm_db *, Segment *, Pgno *, int); int lsmFsSortedDelete(FileSystem *, Snapshot *, int, Segment *); int lsmFsSortedFinish(FileSystem *, Segment *); int lsmFsSortedAppend(FileSystem *, Snapshot *, Segment *, Page **); int lsmFsSortedPadding(FileSystem *, Snapshot *, Segment *); /* Functions to retrieve the lsm_env pointer from a FileSystem or Page object */ lsm_env *lsmFsEnv(FileSystem *); lsm_env *lsmPageEnv(Page *); FileSystem *lsmPageFS(Page *); int lsmFsSectorSize(FileSystem *); void lsmSortedSplitkey(lsm_db *, Level *, int *); /* Reading sorted run content. */ int lsmFsDbPageLast(FileSystem *pFS, Segment *pSeg, Page **ppPg); int lsmFsDbPageGet(FileSystem *, Pgno, Page **); int lsmFsDbPageNext(Segment *, Page *, int eDir, Page **); u8 *lsmFsPageData(Page *, int *); int lsmFsPageRelease(Page *); int lsmFsPagePersist(Page *); void lsmFsPageRef(Page *); Pgno lsmFsPageNumber(Page *); int lsmFsNRead(FileSystem *); int lsmFsNWrite(FileSystem *); int lsmFsMetaPageGet(FileSystem *, int, int, MetaPage **); int lsmFsMetaPageRelease(MetaPage *); u8 *lsmFsMetaPageData(MetaPage *, int *); #ifdef LSM_DEBUG int lsmFsDbPageIsLast(Segment *pSeg, Page *pPg); int lsmFsIntegrityCheck(lsm_db *); #endif int lsmFsPageWritable(Page *); /* Functions to read, write and sync the log file. */ int lsmFsWriteLog(FileSystem *pFS, i64 iOff, LsmString *pStr); |
︙ | ︙ | |||
696 697 698 699 700 701 702 | int lsmFlushTreeToDisk(lsm_db *pDb); void lsmSortedRemap(lsm_db *pDb); void lsmSortedFreeLevel(lsm_env *pEnv, Level *); | < | 694 695 696 697 698 699 700 701 702 703 704 705 706 707 | int lsmFlushTreeToDisk(lsm_db *pDb); void lsmSortedRemap(lsm_db *pDb); void lsmSortedFreeLevel(lsm_env *pEnv, Level *); int lsmSortedAdvanceAll(lsm_db *pDb); int lsmSortedLoadMerge(lsm_db *, Level *, u32 *, int *); int lsmSortedLoadFreelist(lsm_db *pDb, void **, int *); void *lsmSortedSplitKey(Level *pLevel, int *pnByte); |
︙ | ︙ |
Changes to src/lsm_ckpt.c.
︙ | ︙ | |||
51 52 53 54 55 56 57 | ** ** 4 integers. See ckptExportAppendlist(). ** ** For each level in the database, a level record. Formatted as follows: ** ** 0. Age of the level. ** 1. The number of right-hand segments (nRight, possibly 0), | | | | > | | | > | 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 | ** ** 4 integers. See ckptExportAppendlist(). ** ** For each level in the database, a level record. Formatted as follows: ** ** 0. Age of the level. ** 1. The number of right-hand segments (nRight, possibly 0), ** 2. Segment record for left-hand segment (8 integers defined below), ** 3. Segment record for each right-hand segment (8 integers defined below), ** 4. If nRight>0, The number of segments involved in the merge ** 5. if nRight>0, Current nSkip value (see Merge structure defn.), ** 6. For each segment in the merge: ** 5a. Page number of next cell to read during merge (this field ** is 64-bits - 2 integers) ** 5b. Cell number of next cell to read during merge ** 7. Page containing current split-key (64-bits - 2 integers). ** 8. Cell within page containing current split-key. ** 9. Current pointer value (64-bits - 2 integers). ** ** The freelist. ** ** 1. Number of free-list entries stored in checkpoint header. ** 2. For each entry: ** 2a. Block number of free block. ** 2b. MSW of associated checkpoint id. ** 2c. LSW of associated checkpoint id. ** ** If the overflow flag is set, then extra free-list entries may be stored ** in the FREELIST record. The FREELIST record contains 3 32-bit integers ** per entry, in the same format as above (without the "number of entries" ** field). ** ** The checksum: ** ** 1. Checksum value 1. ** 2. Checksum value 2. ** ** In the above, a segment record consists of the following four 64-bit ** fields (converted to 2 * u32 by storing the MSW followed by LSW): ** ** 1. First page of array, ** 2. Last page of array, ** 3. Root page of array (or 0), ** 4. Size of array in pages. */ |
︙ | ︙ | |||
103 104 105 106 107 108 109 | ** follows: ** ** * For each level in the database not undergoing a merge, add 1. ** ** * For each level in the database that is undergoing a merge, add ** the number of segments on the rhs of the level. ** | | | | | > > > > > > | 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 | ** follows: ** ** * For each level in the database not undergoing a merge, add 1. ** ** * For each level in the database that is undergoing a merge, add ** the number of segments on the rhs of the level. ** ** A level record not undergoing a merge is 10 integers. A level record ** with nRhs rhs segments and (nRhs+1) input segments (i.e. including the ** separators from the next level) is (11*nRhs+20) integers. The maximum ** per right-hand-side level is therefore 21 integers. So the maximum ** size of all level records in a checkpoint is 21*40=820 integers. ** ** TODO: Before pointer values were changed from 32 to 64 bits, the above ** used to come to 420 bytes - leaving significant space for a free-list ** prefix. No more. To fix this, reduce the size of the level records in ** a db snapshot, and improve management of the free-list tail in ** lsm_sorted.c. */ #define LSM_MAX_RHS_SEGMENTS 40 /* ** LARGE NUMBERS OF FREELIST ENTRIES: ** ** There is also a limit (LSM_MAX_FREELIST_ENTRIES - defined in lsmInt.h) |
︙ | ︙ | |||
157 158 159 160 161 162 163 | static const int one = 1; #define LSM_LITTLE_ENDIAN (*(u8 *)(&one)) /* Sizes, in integers, of various parts of the checkpoint. */ #define CKPT_HDR_SIZE 9 #define CKPT_LOGPTR_SIZE 4 | < < | | 165 166 167 168 169 170 171 172 173 174 175 176 177 178 179 | static const int one = 1; #define LSM_LITTLE_ENDIAN (*(u8 *)(&one)) /* Sizes, in integers, of various parts of the checkpoint. */ #define CKPT_HDR_SIZE 9 #define CKPT_LOGPTR_SIZE 4 #define CKPT_APPENDLIST_SIZE (LSM_APPLIST_SZ * 2) /* A #define to describe each integer in the checkpoint header. */ #define CKPT_HDR_ID_MSW 0 #define CKPT_HDR_ID_LSW 1 #define CKPT_HDR_NCKPT 2 #define CKPT_HDR_NBLOCK 3 #define CKPT_HDR_BLKSZ 4 |
︙ | ︙ | |||
257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 | if( *pRc==LSM_OK ){ u32 aCksum[2] = {0, 0}; ckptChecksum(p->aCkpt, nCkpt+2, &aCksum[0], &aCksum[1]); ckptSetValue(p, nCkpt, aCksum[0], pRc); ckptSetValue(p, nCkpt+1, aCksum[1], pRc); } } /* ** Append a 6-value segment record corresponding to pSeg to the checkpoint ** buffer passed as the third argument. */ static void ckptExportSegment( Segment *pSeg, CkptBuffer *p, int *piOut, int *pRc ){ | > > > > > > > > > > > > > > > > > > < < | | | | < < | 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 | if( *pRc==LSM_OK ){ u32 aCksum[2] = {0, 0}; ckptChecksum(p->aCkpt, nCkpt+2, &aCksum[0], &aCksum[1]); ckptSetValue(p, nCkpt, aCksum[0], pRc); ckptSetValue(p, nCkpt+1, aCksum[1], pRc); } } static void ckptAppend64(CkptBuffer *p, int *piOut, i64 iVal, int *pRc){ int iOut = *piOut; ckptSetValue(p, iOut++, (iVal >> 32) & 0xFFFFFFFF, pRc); ckptSetValue(p, iOut++, (iVal & 0xFFFFFFFF), pRc); *piOut = iOut; } static i64 ckptRead64(u32 *a){ return (((i64)a[0]) << 32) + (i64)a[1]; } static i64 ckptGobble64(u32 *a, int *piIn){ int iIn = *piIn; *piIn += 2; return ckptRead64(&a[iIn]); } /* ** Append a 6-value segment record corresponding to pSeg to the checkpoint ** buffer passed as the third argument. */ static void ckptExportSegment( Segment *pSeg, CkptBuffer *p, int *piOut, int *pRc ){ ckptAppend64(p, piOut, pSeg->iFirst, pRc); ckptAppend64(p, piOut, pSeg->iLastPg, pRc); ckptAppend64(p, piOut, pSeg->iRoot, pRc); ckptAppend64(p, piOut, pSeg->nSize, pRc); } static void ckptExportLevel( Level *pLevel, /* Level object to serialize */ CkptBuffer *p, /* Append new level record to this ckpt */ int *piOut, /* IN/OUT: Size of checkpoint so far */ int *pRc /* IN/OUT: Error code */ |
︙ | ︙ | |||
304 305 306 307 308 309 310 | } assert( pMerge->nInput==pLevel->nRight || pMerge->nInput==pLevel->nRight+1 ); ckptSetValue(p, iOut++, pMerge->nInput, pRc); ckptSetValue(p, iOut++, pMerge->nSkip, pRc); for(i=0; i<pMerge->nInput; i++){ | | | | | 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 | } assert( pMerge->nInput==pLevel->nRight || pMerge->nInput==pLevel->nRight+1 ); ckptSetValue(p, iOut++, pMerge->nInput, pRc); ckptSetValue(p, iOut++, pMerge->nSkip, pRc); for(i=0; i<pMerge->nInput; i++){ ckptAppend64(p, &iOut, pMerge->aInput[i].iPg, pRc); ckptSetValue(p, iOut++, pMerge->aInput[i].iCell, pRc); } ckptAppend64(p, &iOut, pMerge->splitkey.iPg, pRc); ckptSetValue(p, iOut++, pMerge->splitkey.iCell, pRc); ckptAppend64(p, &iOut, pMerge->iCurrentPtr, pRc); } *piOut = iOut; } /* ** Populate the log offset fields of the checkpoint buffer. 4 values. |
︙ | ︙ | |||
331 332 333 334 335 336 337 | ){ int iOut = *piOut; assert( iOut==CKPT_HDR_LO_MSW ); if( bFlush ){ i64 iOff = pDb->treehdr.iOldLog; | < | < | | | < | 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 | ){ int iOut = *piOut; assert( iOut==CKPT_HDR_LO_MSW ); if( bFlush ){ i64 iOff = pDb->treehdr.iOldLog; ckptAppend64(p, &iOut, iOff, pRc); ckptSetValue(p, iOut++, pDb->treehdr.oldcksum0, pRc); ckptSetValue(p, iOut++, pDb->treehdr.oldcksum1, pRc); }else{ for(; iOut<=CKPT_HDR_LO_CKSUM2; iOut++){ ckptSetValue(p, iOut, pDb->pShmhdr->aSnap2[iOut], pRc); } } *piOut = iOut; } static void ckptExportAppendlist( lsm_db *db, /* Database connection */ CkptBuffer *p, /* Checkpoint buffer to write to */ int *piOut, /* IN/OUT: Offset within checkpoint buffer */ int *pRc /* IN/OUT: Error code */ ){ int i; Pgno *aiAppend = db->pWorker->aiAppend; for(i=0; i<LSM_APPLIST_SZ; i++){ ckptAppend64(p, piOut, aiAppend[i], pRc); } }; static int ckptExportSnapshot( lsm_db *pDb, /* Connection handle */ int nOvfl, /* Number of free-list entries in LSM */ int bLog, /* True to update log-offset fields */ i64 iId, /* Checkpoint id */ |
︙ | ︙ | |||
461 462 463 464 465 466 467 | ** Helper function for ckptImport(). */ static void ckptNewSegment( u32 *aIn, int *piIn, Segment *pSegment /* Populate this structure */ ){ | < < | | | | | | < < | | | | 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 514 515 516 517 518 519 520 521 522 523 524 525 526 | ** Helper function for ckptImport(). */ static void ckptNewSegment( u32 *aIn, int *piIn, Segment *pSegment /* Populate this structure */ ){ assert( pSegment->iFirst==0 && pSegment->iLastPg==0 ); assert( pSegment->nSize==0 && pSegment->iRoot==0 ); pSegment->iFirst = ckptGobble64(aIn, piIn); pSegment->iLastPg = ckptGobble64(aIn, piIn); pSegment->iRoot = ckptGobble64(aIn, piIn); pSegment->nSize = ckptGobble64(aIn, piIn); assert( pSegment->iFirst ); } static int ckptSetupMerge(lsm_db *pDb, u32 *aInt, int *piIn, Level *pLevel){ Merge *pMerge; /* Allocated Merge object */ int nInput; /* Number of input segments in merge */ int iIn = *piIn; /* Next value to read from aInt[] */ int i; /* Iterator variable */ int nByte; /* Number of bytes to allocate */ /* Allocate the Merge object. If malloc() fails, return LSM_NOMEM. */ nInput = (int)aInt[iIn++]; nByte = sizeof(Merge) + sizeof(MergeInput) * nInput; pMerge = (Merge *)lsmMallocZero(pDb->pEnv, nByte); if( !pMerge ) return LSM_NOMEM_BKPT; pLevel->pMerge = pMerge; /* Populate the Merge object. */ pMerge->aInput = (MergeInput *)&pMerge[1]; pMerge->nInput = nInput; pMerge->iOutputOff = -1; pMerge->nSkip = (int)aInt[iIn++]; for(i=0; i<nInput; i++){ pMerge->aInput[i].iPg = ckptGobble64(aInt, &iIn); pMerge->aInput[i].iCell = (int)aInt[iIn++]; } pMerge->splitkey.iPg = ckptGobble64(aInt, &iIn); pMerge->splitkey.iCell = (int)aInt[iIn++]; pMerge->iCurrentPtr = ckptGobble64(aInt, &iIn); /* Set *piIn and return LSM_OK. */ *piIn = iIn; return LSM_OK; } |
︙ | ︙ | |||
885 886 887 888 889 890 891 | 0, /* CKPT_HDR_NBLOCK */ 0, /* CKPT_HDR_BLKSZ */ 0, /* CKPT_HDR_NLEVEL */ 0, /* CKPT_HDR_PGSZ */ 0, /* CKPT_HDR_OVFL */ 0, /* CKPT_HDR_NWRITE */ 0, 0, 1234, 5678, /* The log pointer and initial checksum */ | | | 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 | 0, /* CKPT_HDR_NBLOCK */ 0, /* CKPT_HDR_BLKSZ */ 0, /* CKPT_HDR_NLEVEL */ 0, /* CKPT_HDR_PGSZ */ 0, /* CKPT_HDR_OVFL */ 0, /* CKPT_HDR_NWRITE */ 0, 0, 1234, 5678, /* The log pointer and initial checksum */ 0,0,0,0, 0,0,0,0, /* The append list */ 0, /* The free block list */ 0, 0 /* Space for checksum values */ }; u32 nCkpt = array_size(aCkpt); ShmHeader *pShm = pDb->pShmhdr; aCkpt[CKPT_HDR_NCKPT] = nCkpt; |
︙ | ︙ | |||
1047 1048 1049 1050 1051 1052 1053 | ){ int rc = LSM_OK; Snapshot *pNew; pNew = (Snapshot *)lsmMallocZeroRc(pDb->pEnv, sizeof(Snapshot), &rc); if( rc==LSM_OK ){ int nFree; | | | | > > | 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 | ){ int rc = LSM_OK; Snapshot *pNew; pNew = (Snapshot *)lsmMallocZeroRc(pDb->pEnv, sizeof(Snapshot), &rc); if( rc==LSM_OK ){ int nFree; int i; int nLevel = (int)aCkpt[CKPT_HDR_NLEVEL]; int iIn = CKPT_HDR_SIZE + CKPT_APPENDLIST_SIZE + CKPT_LOGPTR_SIZE; pNew->iId = lsmCheckpointId(aCkpt, 0); pNew->nBlock = aCkpt[CKPT_HDR_NBLOCK]; pNew->nWrite = aCkpt[CKPT_HDR_NWRITE]; rc = ckptLoadLevels(pDb, aCkpt, &iIn, nLevel, &pNew->pLevel); pNew->iLogOff = lsmCheckpointLogOffset(aCkpt); /* Make a copy of the append-list */ for(i=0; i<LSM_APPLIST_SZ; i++){ u32 *a = &aCkpt[CKPT_HDR_SIZE + CKPT_LOGPTR_SIZE + i*2]; pNew->aiAppend[i] = ckptRead64(a); } /* Copy the free-list */ if( bInclFreelist ){ pNew->nFreelistOvfl = aCkpt[CKPT_HDR_OVFL]; nFree = aCkpt[iIn++]; if( nFree ){ pNew->freelist.aEntry = (FreelistEntry *)lsmMallocZeroRc( |
︙ | ︙ |
Changes to src/lsm_file.c.
1 2 3 4 5 6 7 8 9 10 11 12 | /* ** 2011-08-26 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** | | | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 | /* ** 2011-08-26 ** ** The author disclaims copyright to this source code. In place of ** a legal notice, here is a blessing: ** ** May you do good and not evil. ** May you find forgiveness for yourself and forgive others. ** May you share freely, never taking more than you give. ** ************************************************************************* ** ** NORMAL DATABASE FILE FORMAT ** ** The following database file format concepts are used by the code in ** this file to read and write the database file. ** ** Pages: ** ** A database file is divided into pages. The first 8KB of the file consists |
︙ | ︙ | |||
81 82 83 84 85 86 87 88 89 90 91 92 93 | ** lsmFsOpenLog ** lsmFsWriteLog ** lsmFsSyncLog ** lsmFsReadLog ** lsmFsTruncateLog ** lsmFsCloseAndDeleteLog ** */ #include "lsmInt.h" #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > < | 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 | ** lsmFsOpenLog ** lsmFsWriteLog ** lsmFsSyncLog ** lsmFsReadLog ** lsmFsTruncateLog ** lsmFsCloseAndDeleteLog ** ** COMPRESSED DATABASE FILE FORMAT ** ** The compressed database file format is very similar to the normal format. ** The file still begins with two 4KB meta-pages (which are never compressed). ** It is still divided into blocks. ** ** The first and last four bytes of each block are reserved for 32-bit ** pointer values. Similar to the way four bytes are carved from the end of ** the first and last page of each block in uncompressed databases. From ** the point of view of the upper layer, all pages are the same size - this ** is different from the uncompressed format where the first and last pages ** on each block are 4 bytes smaller than the others. ** ** Pages are stored in variable length compressed form, as follows: ** ** * 3-byte size field containing the size of the compressed page image ** in bytes. The most significant bit of each byte of the size field ** is always set. The remaining 7 bits are used to store a 21-bit ** integer value (in big-endian order - the first byte in the field ** contains the most significant 7 bits). Since the maximum allowed ** size of a compressed page image is (2^17 - 1) bytes, there are ** actually 4 unused bits in the size field. ** ** In other words, if the size of the compressed page image is nSz, ** the header can be serialized as follows: ** ** u8 aHdr[3] ** aHdr[0] = 0x80 | (u8)(nSz >> 14); ** aHdr[1] = 0x80 | (u8)(nSz >> 7); ** aHdr[2] = 0x80 | (u8)(nSz >> 0); ** ** * Compressed page image. ** ** * A second copy of the 3-byte record header. ** ** A page number is a byte offset into the database file. So the smallest ** possible page number is 8192 (immediately after the two meta-pages). ** The first and root page of a segment are identified by a page number ** corresponding to the byte offset of the first byte in the corresponding ** page record. The last page of a segment is identified by the byte offset ** of the last byte in its record. ** ** Unlike uncompressed pages, compressed page records may span blocks. ** ** Sometimes, in order to avoid touching sectors that contain synced data ** when writing, it is necessary to insert unused space between compressed ** page records. This can be done as follows: ** ** * For less than 6 bytes of empty space, the first and last byte ** of the free space contain the total number of free bytes. For ** example: ** ** Block of 4 free bytes: 0x04 0x?? 0x?? 0x04 ** Block of 2 free bytes: 0x02 0x02 ** A single free byte: 0x01 ** ** * For 6 or more bytes of empty space, a record similar to a ** compressed page record is added to the segment. A padding record ** is distinguished from a compressed page record by the most ** significant bit of the second byte of the size field, which is ** cleared instead of set. */ #include "lsmInt.h" #include <sys/types.h> #include <sys/stat.h> #include <fcntl.h> /* ** File-system object. Each database connection allocates a single instance ** of the following structure. It is used for all access to the database and ** log files. ** ** pLruFirst, pLruLast: |
︙ | ︙ | |||
120 121 122 123 124 125 126 127 128 129 130 131 132 133 | int nPagesize; /* Database page-size in bytes */ int nBlocksize; /* Database block-size in bytes */ /* r/w file descriptors for both files. */ LsmFile *pLsmFile; lsm_file *fdDb; /* Database file */ lsm_file *fdLog; /* Log file */ /* mmap() mode things */ int bUseMmap; /* True to use mmap() to access db file */ void *pMap; /* Current mapping of database file */ i64 nMap; /* Bytes mapped at pMap */ Page *pFree; | > > > > > > > | 180 181 182 183 184 185 186 187 188 189 190 191 192 193 194 195 196 197 198 199 200 | int nPagesize; /* Database page-size in bytes */ int nBlocksize; /* Database block-size in bytes */ /* r/w file descriptors for both files. */ LsmFile *pLsmFile; lsm_file *fdDb; /* Database file */ lsm_file *fdLog; /* Log file */ int szSector; /* Database file sector size */ /* If this is a compressed database, a pointer to the compression methods. ** For an uncompressed database, a NULL pointer. */ lsm_compress *pCompress; u8 *aBuffer; /* Buffer to compress into */ int nBuffer; /* Allocated size of aBuffer[] in bytes */ /* mmap() mode things */ int bUseMmap; /* True to use mmap() to access db file */ void *pMap; /* Current mapping of database file */ i64 nMap; /* Bytes mapped at pMap */ Page *pFree; |
︙ | ︙ | |||
143 144 145 146 147 148 149 150 151 152 153 | Page *pLruLast; /* Tail of the LRU list */ int nHash; /* Number of hash slots in hash table */ Page **apHash; /* nHash Hash slots */ }; /* ** Database page handle. */ struct Page { u8 *aData; /* Buffer containing page data */ int nData; /* Bytes of usable data at aData[] */ | > > > > > > > > > > | > > > > > | 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 | Page *pLruLast; /* Tail of the LRU list */ int nHash; /* Number of hash slots in hash table */ Page **apHash; /* nHash Hash slots */ }; /* ** Database page handle. ** ** pSeg: ** When lsmFsSortedAppend() is called on a compressed database, the new ** page is not assigned a page number or location in the database file ** immediately. Instead, these are assigned by the lsmFsPagePersist() call ** right before it writes the compressed page image to disk. ** ** The lsmFsSortedAppend() function sets the pSeg pointer to point to the ** segment that the new page will be a part of. It is unset by ** lsmFsPagePersist() after the page is written to disk. */ struct Page { u8 *aData; /* Buffer containing page data */ int nData; /* Bytes of usable data at aData[] */ Pgno iPg; /* Page number */ int nRef; /* Number of outstanding references */ int flags; /* Combination of PAGE_XXX flags */ Page *pHashNext; /* Next page in hash table slot */ Page *pLruNext; /* Next page in LRU list */ Page *pLruPrev; /* Previous page in LRU list */ FileSystem *pFS; /* File system that owns this page */ /* Only used in compressed database mode: */ int nCompress; /* Compressed size (or 0 for uncomp. db) */ int nCompressPrev; /* Compressed size of prev page */ Segment *pSeg; /* Segment this page will be written to */ }; /* ** Meta-data page handle. There are two meta-data pages at the start of ** the database file, each FileSystem.nMetasize bytes in size. */ struct MetaPage { |
︙ | ︙ | |||
180 181 182 183 184 185 186 187 188 189 190 191 192 193 | /* ** Number of pgsz byte pages omitted from the start of block 1. The start ** of block 1 contains two 4096 byte meta pages (8192 bytes in total). */ #define BLOCK1_HDR_SIZE(pgsz) LSM_MAX(1, 8192/(pgsz)) /* ** Wrappers around the VFS methods of the lsm_env object: ** ** lsmEnvOpen() ** lsmEnvRead() ** lsmEnvWrite() | > > > > > > > > > > > > > > > > | 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 | /* ** Number of pgsz byte pages omitted from the start of block 1. The start ** of block 1 contains two 4096 byte meta pages (8192 bytes in total). */ #define BLOCK1_HDR_SIZE(pgsz) LSM_MAX(1, 8192/(pgsz)) /* ** If NDEBUG is not defined, set a breakpoint in function lsmIoerrBkpt() ** to catch IO errors. */ #ifndef NDEBUG static void lsmIoerrBkpt(){ static int nErr = 0; nErr++; } static int IOERR_WRAPPER(int rc){ if( rc!=LSM_OK ) lsmIoerrBkpt(); return rc; } #else # define IOERR_WRAPPER(rc) (rc) #endif /* ** Wrappers around the VFS methods of the lsm_env object: ** ** lsmEnvOpen() ** lsmEnvRead() ** lsmEnvWrite() |
︙ | ︙ | |||
204 205 206 207 208 209 210 | static int lsmEnvRead( lsm_env *pEnv, lsm_file *pFile, lsm_i64 iOff, void *pRead, int nRead ){ | | | | | | | | | 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 | static int lsmEnvRead( lsm_env *pEnv, lsm_file *pFile, lsm_i64 iOff, void *pRead, int nRead ){ return IOERR_WRAPPER( pEnv->xRead(pFile, iOff, pRead, nRead) ); } static int lsmEnvWrite( lsm_env *pEnv, lsm_file *pFile, lsm_i64 iOff, const void *pWrite, int nWrite ){ return IOERR_WRAPPER( pEnv->xWrite(pFile, iOff, (void *)pWrite, nWrite) ); } static int lsmEnvSync(lsm_env *pEnv, lsm_file *pFile){ return IOERR_WRAPPER( pEnv->xSync(pFile) ); } static int lsmEnvSectorSize(lsm_env *pEnv, lsm_file *pFile){ return pEnv->xSectorSize(pFile); } int lsmEnvClose(lsm_env *pEnv, lsm_file *pFile){ return IOERR_WRAPPER( pEnv->xClose(pFile) ); } static int lsmEnvTruncate(lsm_env *pEnv, lsm_file *pFile, lsm_i64 nByte){ return IOERR_WRAPPER( pEnv->xTruncate(pFile, nByte) ); } static int lsmEnvUnlink(lsm_env *pEnv, const char *zDel){ return IOERR_WRAPPER( pEnv->xUnlink(pEnv, zDel) ); } static int lsmEnvRemap( lsm_env *pEnv, lsm_file *pFile, i64 szMin, void **ppMap, i64 *pszMap |
︙ | ︙ | |||
393 394 395 396 397 398 399 | pFS->zDb = (char *)&pFS[1]; pFS->zLog = &pFS->zDb[nDb+1]; pFS->nPagesize = LSM_DFLT_PAGE_SIZE; pFS->nBlocksize = LSM_DFLT_BLOCK_SIZE; pFS->nMetasize = 4 * 1024; pFS->pDb = pDb; pFS->pEnv = pDb->pEnv; | > > > | > | 491 492 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 | pFS->zDb = (char *)&pFS[1]; pFS->zLog = &pFS->zDb[nDb+1]; pFS->nPagesize = LSM_DFLT_PAGE_SIZE; pFS->nBlocksize = LSM_DFLT_BLOCK_SIZE; pFS->nMetasize = 4 * 1024; pFS->pDb = pDb; pFS->pEnv = pDb->pEnv; if( pDb->compress.xCompress ){ pFS->pCompress = &pDb->compress; }else{ pFS->bUseMmap = pDb->bMmap; } /* Make a copy of the database and log file names. */ memcpy(pFS->zDb, zDb, nDb+1); memcpy(pFS->zLog, zDb, nDb); memcpy(&pFS->zLog[nDb], "-log", 5); /* Allocate the hash-table here. At some point, it should be changed |
︙ | ︙ | |||
422 423 424 425 426 427 428 429 430 431 432 433 434 435 | pFS->fdDb = fsOpenFile(pFS, 0, &rc); } } if( rc!=LSM_OK ){ lsmFsClose(pFS); pFS = 0; } } pDb->pFS = pFS; return rc; } | > > | 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 | pFS->fdDb = fsOpenFile(pFS, 0, &rc); } } if( rc!=LSM_OK ){ lsmFsClose(pFS); pFS = 0; }else{ pFS->szSector = lsmEnvSectorSize(pFS->pEnv, pFS->fdDb); } } pDb->pFS = pFS; return rc; } |
︙ | ︙ | |||
450 451 452 453 454 455 456 457 458 459 460 461 462 463 | pPg = pNext; } if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb ); if( pFS->fdLog ) lsmEnvClose(pFS->pEnv, pFS->fdLog ); lsmFree(pEnv, pFS->pLsmFile); lsmFree(pEnv, pFS->apHash); lsmFree(pEnv, pFS); } } void lsmFsDeferClose(FileSystem *pFS, LsmFile **pp){ LsmFile *p = pFS->pLsmFile; assert( p->pNext==0 ); | > | 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 | pPg = pNext; } if( pFS->fdDb ) lsmEnvClose(pFS->pEnv, pFS->fdDb ); if( pFS->fdLog ) lsmEnvClose(pFS->pEnv, pFS->fdLog ); lsmFree(pEnv, pFS->pLsmFile); lsmFree(pEnv, pFS->apHash); lsmFree(pEnv, pFS->aBuffer); lsmFree(pEnv, pFS); } } void lsmFsDeferClose(FileSystem *pFS, LsmFile **pp){ LsmFile *p = pFS->pLsmFile; assert( p->pNext==0 ); |
︙ | ︙ | |||
526 527 528 529 530 531 532 533 534 | void lsmFsSetBlockSize(FileSystem *pFS, int nBlocksize){ pFS->nBlocksize = nBlocksize; } /* ** Return the page number of the first page on block iBlock. Blocks are ** numbered starting from 1. */ static Pgno fsFirstPageOnBlock(FileSystem *pFS, int iBlock){ | > > > > > > > > > > > > | < | | | | > > > > > > > > | | > > > > > > > > > > > > > > > > > > > | | | | | > | 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 | void lsmFsSetBlockSize(FileSystem *pFS, int nBlocksize){ pFS->nBlocksize = nBlocksize; } /* ** Return the page number of the first page on block iBlock. Blocks are ** numbered starting from 1. ** ** For a compressed database, page numbers are byte offsets. The first ** page on each block is the byte offset immediately following the 4-byte ** "previous block" pointer at the start of each block. */ static Pgno fsFirstPageOnBlock(FileSystem *pFS, int iBlock){ Pgno iPg; if( pFS->pCompress ){ if( iBlock==1 ){ iPg = pFS->nMetasize * 2 + 4; }else{ iPg = pFS->nBlocksize * (Pgno)(iBlock-1) + 4; } }else{ const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); if( iBlock==1 ){ iPg = 1 + ((pFS->nMetasize*2 + pFS->nPagesize - 1) / pFS->nPagesize); }else{ iPg = 1 + (iBlock-1) * nPagePerBlock; } } return iPg; } /* ** Return the page number of the last page on block iBlock. Blocks are ** numbered starting from 1. ** ** For a compressed database, page numbers are byte offsets. The first ** page on each block is the byte offset of the byte immediately before ** the 4-byte "next block" pointer at the end of each block. */ static Pgno fsLastPageOnBlock(FileSystem *pFS, int iBlock){ if( pFS->pCompress ){ return pFS->nBlocksize * (Pgno)iBlock - 1 - 4; }else{ const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); return iBlock * nPagePerBlock; } } /* ** Return the block number of the block that page iPg is located on. ** Blocks are numbered starting from 1. */ static int fsPageToBlock(FileSystem *pFS, Pgno iPg){ if( pFS->pCompress ){ return (iPg / pFS->nBlocksize) + 1; }else{ return 1 + ((iPg-1) / (pFS->nBlocksize / pFS->nPagesize)); } } /* ** Return true if page iPg is the last page on its block. ** ** This function is only called in non-compressed database mode. */ static int fsIsLast(FileSystem *pFS, Pgno iPg){ const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); assert( !pFS->pCompress ); return ( iPg && (iPg % nPagePerBlock)==0 ); } /* ** Return true if page iPg is the first page on its block. ** ** This function is only called in non-compressed database mode. */ static int fsIsFirst(FileSystem *pFS, Pgno iPg){ const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); assert( !pFS->pCompress ); return ( (iPg % nPagePerBlock)==1 || (iPg<nPagePerBlock && iPg==fsFirstPageOnBlock(pFS, 1)) ); } /* ** Given a page reference, return a pointer to the in-memory buffer of the ** pages contents. If parameter pnData is not NULL, set *pnData to the size ** of the buffer in bytes before returning. */ u8 *lsmFsPageData(Page *pPage, int *pnData){ if( pnData ){ #ifndef NDEBUG int bShort = (pPage->pFS->pCompress==0 && (fsIsFirst(pPage->pFS, pPage->iPg) || fsIsLast(pPage->pFS, pPage->iPg)) ); assert( bShort==!!(pPage->flags & PAGE_SHORT) ); assert( PAGE_SHORT==4 ); #endif *pnData = pPage->pFS->nPagesize - (pPage->flags & PAGE_SHORT); } return pPage->aData; } /* ** Return the page number of a page. */ Pgno lsmFsPageNumber(Page *pPage){ assert( (pPage->flags & PAGE_DIRTY)==0 ); return pPage ? pPage->iPg : 0; } /* ** Page pPg is currently part of the LRU list belonging to pFS. Remove ** it from the list. pPg->pLruNext and pPg->pLruPrev are cleared by this ** operation. |
︙ | ︙ | |||
683 684 685 686 687 688 689 | lsmFree(pPg->pFS->pEnv, pPg->aData); } else if( pPg->pFS->bUseMmap ){ fsPageRemoveFromLru(pPg->pFS, pPg); } lsmFree(pPg->pFS->pEnv, pPg); } | < < < < < > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | > | 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074 1075 1076 1077 1078 1079 1080 1081 1082 1083 1084 1085 1086 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 | lsmFree(pPg->pFS->pEnv, pPg->aData); } else if( pPg->pFS->bUseMmap ){ fsPageRemoveFromLru(pPg->pFS, pPg); } lsmFree(pPg->pFS->pEnv, pPg); } static void fsGrowMapping( FileSystem *pFS, i64 iSz, int *pRc ){ /* This function won't work with compressed databases yet. */ assert( pFS->pCompress==0 ); if( *pRc==LSM_OK && iSz>pFS->nMap ){ Page *pFix; int rc; u8 *aOld = pFS->pMap; rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap); if( rc==LSM_OK && pFS->pMap!=aOld ){ u8 *aData = (u8 *)pFS->pMap; for(pFix=pFS->pLruFirst; pFix; pFix=pFix->pLruNext){ assert( &aOld[pFS->nPagesize * (i64)(pFix->iPg-1)]==pFix->aData ); pFix->aData = &aData[pFS->nPagesize * (i64)(pFix->iPg-1)]; } lsmSortedRemap(pFS->pDb); } *pRc = rc; } } static int fsPageGet(FileSystem *, Pgno, int, Page **, int *); /* ** Parameter iBlock is a database file block. This function reads the value ** stored in the blocks "next block" pointer and stores it in *piNext. ** LSM_OK is returned if everything is successful, or an LSM error code ** otherwise. */ static int fsBlockNext( FileSystem *pFS, /* File-system object handle */ int iBlock, /* Read field from this block */ int *piNext /* OUT: Next block in linked list */ ){ int rc; assert( pFS->bUseMmap==0 || pFS->pCompress==0 ); if( pFS->pCompress ){ i64 iOff; /* File offset to read data from */ u8 aNext[4]; /* 4-byte pointer read from db file */ iOff = (i64)iBlock * pFS->nBlocksize - sizeof(aNext); rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aNext, sizeof(aNext)); if( rc==LSM_OK ){ *piNext = (int)lsmGetU32(aNext); } }else{ const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize); Page *pLast; rc = fsPageGet(pFS, iBlock*nPagePerBlock, 0, &pLast, 0); if( rc==LSM_OK ){ *piNext = fsPageToBlock(pFS, lsmGetU32(&pLast->aData[pFS->nPagesize-4])); lsmFsPageRelease(pLast); } } return rc; } /* ** Return the page number of the last page on the same block as page iPg. */ Pgno fsLastPageOnPagesBlock(FileSystem *pFS, Pgno iPg){ return fsLastPageOnBlock(pFS, fsPageToBlock(pFS, iPg)); } /* ** This function is only called in compressed database mode. */ static int fsReadData( FileSystem *pFS, /* File-system handle */ i64 iOff, /* Read data from this offset */ u8 *aData, /* Buffer to read data into */ int nData /* Number of bytes to read */ ){ i64 iEob; /* End of block */ int nRead; int rc; assert( pFS->pCompress ); iEob = fsLastPageOnPagesBlock(pFS, iOff) + 1; nRead = LSM_MIN(iEob - iOff, nData); rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nRead); if( rc==LSM_OK && nRead!=nData ){ int iBlk; rc = fsBlockNext(pFS, fsPageToBlock(pFS, iOff), &iBlk); if( rc==LSM_OK ){ i64 iOff2 = fsFirstPageOnBlock(pFS, iBlk); rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff2, &aData[nRead], nData-nRead); } } return rc; } /* ** Parameter iBlock is a database file block. This function reads the value ** stored in the blocks "previous block" pointer and stores it in *piPrev. ** LSM_OK is returned if everything is successful, or an LSM error code ** otherwise. */ static int fsBlockPrev( FileSystem *pFS, /* File-system object handle */ int iBlock, /* Read field from this block */ int *piPrev /* OUT: Previous block in linked list */ ){ int rc = LSM_OK; /* Return code */ assert( pFS->bUseMmap==0 || pFS->pCompress==0 ); assert( iBlock>0 ); if( pFS->pCompress ){ i64 iOff = fsFirstPageOnBlock(pFS, iBlock) - 4; u8 aPrev[4]; /* 4-byte pointer read from db file */ rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aPrev, sizeof(aPrev)); if( rc==LSM_OK ){ *piPrev = (int)lsmGetU32(aPrev); } }else{ assert( 0 ); } return rc; } /* ** Encode and decode routines for record size fields. */ static void putRecordSize(u8 *aBuf, int nByte, int bFree){ aBuf[0] = (u8)(nByte >> 14) | 0x80; aBuf[1] = ((u8)(nByte >> 7) & 0x7F) | (bFree ? 0x00 : 0x80); aBuf[2] = (u8)nByte | 0x80; } static int getRecordSize(u8 *aBuf, int *pbFree){ int nByte; nByte = (aBuf[0] & 0x7F) << 14; nByte += (aBuf[1] & 0x7F) << 7; nByte += (aBuf[2] & 0x7F); *pbFree = !(aBuf[1] & 0x80); return nByte; } static int fsSubtractOffset(FileSystem *pFS, i64 iOff, int iSub, i64 *piRes){ i64 iStart; int iBlk; int rc; assert( pFS->pCompress ); iStart = fsFirstPageOnBlock(pFS, fsPageToBlock(pFS, iOff)); if( (iOff-iSub)>=iStart ){ *piRes = (iOff-iSub); return LSM_OK; } rc = fsBlockPrev(pFS, fsPageToBlock(pFS, iOff), &iBlk); *piRes = fsLastPageOnBlock(pFS, iBlk) - iSub + (iOff - iStart + 1); return rc; } static int fsAddOffset(FileSystem *pFS, i64 iOff, int iAdd, i64 *piRes){ i64 iEob; int iBlk; int rc; assert( pFS->pCompress ); iEob = fsLastPageOnPagesBlock(pFS, iOff); if( (iOff+iAdd)<=iEob ){ *piRes = (iOff+iAdd); return LSM_OK; } rc = fsBlockNext(pFS, fsPageToBlock(pFS, iOff), &iBlk); *piRes = fsFirstPageOnBlock(pFS, iBlk) + iAdd - (iEob - iOff + 1); return rc; } static int fsAllocateBuffer(FileSystem *pFS){ assert( pFS->pCompress ); if( pFS->aBuffer==0 ){ pFS->nBuffer = pFS->pCompress->xBound(pFS->pCompress->pCtx, pFS->nPagesize); if( pFS->nBuffer<(pFS->szSector+6) ){ pFS->nBuffer = pFS->szSector+6; } pFS->aBuffer = lsmMalloc(pFS->pEnv, LSM_MAX(pFS->nBuffer, pFS->nPagesize)); if( pFS->aBuffer==0 ) return LSM_NOMEM_BKPT; } return LSM_OK; } /* ** This function is only called in compressed database mode. It reads and ** uncompresses the compressed data for page pPg from the database and ** populates the pPg->aData[] buffer and pPg->nCompress field. ** ** LSM_OK is returned if successful, or an LSM error code otherwise. */ static int fsReadPagedata( FileSystem *pFS, /* File-system handle */ Page *pPg, /* Page to read and uncompress data for */ int *pnSpace /* OUT: Total bytes of free space */ ){ lsm_compress *p = pFS->pCompress; i64 iOff = pPg->iPg; u8 aSz[3]; int rc; assert( p && pPg->nCompress==0 ); if( fsAllocateBuffer(pFS) ) return LSM_NOMEM; rc = fsReadData(pFS, iOff, aSz, sizeof(aSz)); if( rc==LSM_OK ){ int bFree; if( aSz[0] & 0x80 ){ pPg->nCompress = (int)getRecordSize(aSz, &bFree); }else{ pPg->nCompress = (int)aSz[0] - sizeof(aSz)*2; bFree = 1; } if( bFree ){ if( pnSpace ){ *pnSpace = pPg->nCompress + sizeof(aSz)*2; }else{ rc = LSM_CORRUPT_BKPT; } }else{ rc = fsAddOffset(pFS, iOff, 3, &iOff); if( rc==LSM_OK ){ if( pPg->nCompress>pFS->nBuffer ){ rc = LSM_CORRUPT_BKPT; }else{ rc = fsReadData(pFS, iOff, pFS->aBuffer, pPg->nCompress); } if( rc==LSM_OK ){ int n = pFS->nPagesize; rc = p->xUncompress(p->pCtx, (char *)pPg->aData, &n, (const char *)pFS->aBuffer, pPg->nCompress ); if( rc==LSM_OK && n!=pPg->nData ){ rc = LSM_CORRUPT_BKPT; } } } } } return rc; } /* ** Return a handle for a database page. */ static int fsPageGet( FileSystem *pFS, /* File-system handle */ Pgno iPg, /* Page id */ int noContent, /* True to not load content from disk */ Page **ppPg, /* OUT: New page handle */ int *pnSpace /* OUT: Bytes of free space */ ){ Page *p; int iHash; int rc = LSM_OK; assert( iPg>=fsFirstPageOnBlock(pFS, 1) ); *ppPg = 0; |
︙ | ︙ | |||
761 762 763 764 765 766 767 768 769 770 771 | for(p=pFS->apHash[iHash]; p; p=p->pHashNext){ if( p->iPg==iPg) break; } if( p==0 ){ rc = fsPageBuffer(pFS, 1, &p); if( rc==LSM_OK ){ p->iPg = iPg; p->nRef = 0; p->pFS = pFS; assert( p->flags==0 || p->flags==PAGE_FREE ); | > | > > > > > > | < < | | > | < > > | > | < < < < < < < < < < < < < < < < | | | | | | 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 1178 1179 1180 1181 1182 1183 1184 1185 1186 1187 1188 1189 1190 1191 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 | for(p=pFS->apHash[iHash]; p; p=p->pHashNext){ if( p->iPg==iPg) break; } if( p==0 ){ rc = fsPageBuffer(pFS, 1, &p); if( rc==LSM_OK ){ int nSpace = 0; p->iPg = iPg; p->nRef = 0; p->pFS = pFS; assert( p->flags==0 || p->flags==PAGE_FREE ); if( pFS->pCompress==0 && (fsIsLast(pFS, iPg) || fsIsFirst(pFS, iPg)) ){ p->flags |= PAGE_SHORT; } p->nData = pFS->nPagesize - (p->flags & PAGE_SHORT); #ifdef LSM_DEBUG memset(p->aData, 0x56, pFS->nPagesize); #endif assert( p->pLruNext==0 && p->pLruPrev==0 ); if( noContent==0 ){ if( pFS->pCompress ){ rc = fsReadPagedata(pFS, p, &nSpace); }else{ int nByte = pFS->nPagesize; i64 iOff = (i64)(iPg-1) * pFS->nPagesize; rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, p->aData, nByte); } pFS->nRead++; } /* If the xRead() call was successful (or not attempted), link the ** page into the page-cache hash-table. Otherwise, if it failed, ** free the buffer. */ if( rc==LSM_OK && nSpace==0 ){ p->pHashNext = pFS->apHash[iHash]; pFS->apHash[iHash] = p; }else{ fsPageBufferFree(p); p = 0; if( pnSpace ) *pnSpace = nSpace; } } }else if( p->nRef==0 ){ fsPageRemoveFromLru(pFS, p); } assert( (rc==LSM_OK && (p || (pnSpace && *pnSpace))) || (rc!=LSM_OK && p==0) ); } if( rc==LSM_OK && p ){ pFS->nOut += (p->nRef==0); p->nRef++; } *ppPg = p; return rc; } static int fsRunEndsBetween( Segment *pRun, Segment *pIgnore, Pgno iFirst, Pgno iLast ){ return (pRun!=pIgnore && ( (pRun->iFirst>=iFirst && pRun->iFirst<=iLast) || (pRun->iLastPg>=iFirst && pRun->iLastPg<=iLast) )); } static int fsLevelEndsBetween( Level *pLevel, Segment *pIgnore, Pgno iFirst, Pgno iLast ){ int i; if( fsRunEndsBetween(&pLevel->lhs, pIgnore, iFirst, iLast) ){ return 1; } for(i=0; i<pLevel->nRight; i++){ |
︙ | ︙ | |||
868 869 870 871 872 873 874 | int rc = LSM_OK; /* Return code */ int iFirst; /* First page on block iBlk */ int iLast; /* Last page on block iBlk */ Level *pLevel; /* Used to iterate through levels */ int iIn; /* Used to iterate through append points */ int iOut = 0; /* Used to output append points */ | | | 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 | int rc = LSM_OK; /* Return code */ int iFirst; /* First page on block iBlk */ int iLast; /* Last page on block iBlk */ Level *pLevel; /* Used to iterate through levels */ int iIn; /* Used to iterate through append points */ int iOut = 0; /* Used to output append points */ Pgno *aApp = pSnapshot->aiAppend; iFirst = fsFirstPageOnBlock(pFS, iBlk); iLast = fsLastPageOnBlock(pFS, iBlk); /* Check if any other run in the snapshot has a start or end page ** within this block. If there is such a run, return early. */ for(pLevel=lsmDbSnapshotLevel(pSnapshot); pLevel; pLevel=pLevel->pNext){ |
︙ | ︙ | |||
910 911 912 913 914 915 916 | if( pDel->iFirst ){ int rc = LSM_OK; int iBlk; int iLastBlk; iBlk = fsPageToBlock(pFS, pDel->iFirst); | | | | 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 | if( pDel->iFirst ){ int rc = LSM_OK; int iBlk; int iLastBlk; iBlk = fsPageToBlock(pFS, pDel->iFirst); iLastBlk = fsPageToBlock(pFS, pDel->iLastPg); /* Mark all blocks currently used by this sorted run as free */ while( iBlk && rc==LSM_OK ){ int iNext = 0; if( iBlk!=iLastBlk ){ rc = fsBlockNext(pFS, iBlk, &iNext); }else if( bZero==0 && pDel->iLastPg!=fsLastPageOnBlock(pFS, iLastBlk) ){ break; } rc = fsFreeBlock(pFS, pSnapshot, pDel, iBlk); iBlk = iNext; } if( bZero ) memset(pDel, 0, sizeof(Segment)); |
︙ | ︙ | |||
941 942 943 944 945 946 947 | if( fsPageToBlock(pFS, iPg)==iBlk && (iRet==0 || iPg<iRet) ){ iRet = iPg; } } return iRet; } | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 | if( fsPageToBlock(pFS, iPg)==iBlk && (iRet==0 || iPg<iRet) ){ iRet = iPg; } } return iRet; } /* ** Argument aPgno is an array of nPgno page numbers. All pages belong to ** the segment pRun. This function gobbles from the start of the run to the ** first page that appears in aPgno[] (i.e. so that the aPgno[] entry is ** the new first page of the run). */ void lsmFsGobble( |
︙ | ︙ | |||
1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 | iBlk = iNext; } pRun->nSize -= (pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk)); assert( pRun->nSize>0 ); } /* ** The first argument to this function is a valid reference to a database ** file page that is part of a sorted run. If parameter eDir is -1, this ** function attempts to locate and load the previous page in the same run. ** Or, if eDir is +1, it attempts to find the next page in the same run. ** The results of passing an eDir value other than positive or negative one | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 | iBlk = iNext; } pRun->nSize -= (pRun->iFirst - fsFirstPageOnBlock(pFS, iBlk)); assert( pRun->nSize>0 ); } static int fsNextPageOffset( FileSystem *pFS, /* File system object */ Segment *pSeg, /* Segment to move within */ Pgno iPg, /* Offset of current page */ int nByte, /* Size of current page including headers */ Pgno *piNext /* OUT: Offset of next page. Or zero (EOF) */ ){ Pgno iNext; int rc; assert( pFS->pCompress ); rc = fsAddOffset(pFS, iPg, nByte-1, &iNext); if( pSeg && iNext==pSeg->iLastPg ){ iNext = 0; }else if( rc==LSM_OK ){ rc = fsAddOffset(pFS, iNext, 1, &iNext); } *piNext = iNext; return rc; } static int fsGetPageBefore(FileSystem *pFS, i64 iOff, Pgno *piPrev){ u8 aSz[3]; int rc; i64 iRead; rc = fsSubtractOffset(pFS, iOff, sizeof(aSz), &iRead); if( rc==LSM_OK ) rc = fsReadData(pFS, iRead, aSz, sizeof(aSz)); if( rc==LSM_OK ){ int bFree; int nSz; if( aSz[2] & 0x80 ){ nSz = getRecordSize(aSz, &bFree) + sizeof(aSz)*2; }else{ nSz = (int)(aSz[2] & 0x7F); bFree = 1; } rc = fsSubtractOffset(pFS, iOff, nSz, piPrev); } return rc; } /* ** The first argument to this function is a valid reference to a database ** file page that is part of a sorted run. If parameter eDir is -1, this ** function attempts to locate and load the previous page in the same run. ** Or, if eDir is +1, it attempts to find the next page in the same run. ** The results of passing an eDir value other than positive or negative one |
︙ | ︙ | |||
1034 1035 1036 1037 1038 1039 1040 1041 | ** is set to point to it and LSM_OK is returned. Otherwise, if an error ** occurs, *ppNext is set to NULL and and lsm error code returned. ** ** Page references returned by this function should be released by the ** caller using lsmFsPageRelease(). */ int lsmFsDbPageNext(Segment *pRun, Page *pPg, int eDir, Page **ppNext){ FileSystem *pFS = pPg->pFS; | > | > > > > > > > > > > > > > > > > > > > > > > > > | | | | | | | | | | | | | | | | | | | | > | > | | | | > | > > > > > > > > > > > > > > > > > | | | | | | | < | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | > | | > > > > > > | > > > > | < | | | < | < | < | | > > > > > > > > > > > > > > > > > > > > > > > > | 1416 1417 1418 1419 1420 1421 1422 1423 1424 1425 1426 1427 1428 1429 1430 1431 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 1506 1507 1508 1509 1510 1511 1512 1513 1514 1515 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 1545 1546 1547 1548 1549 1550 1551 1552 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 1568 1569 1570 1571 1572 1573 1574 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 1590 1591 1592 1593 1594 1595 1596 1597 1598 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 1614 1615 1616 1617 1618 1619 1620 1621 1622 1623 1624 1625 1626 1627 1628 1629 1630 1631 1632 1633 1634 1635 1636 1637 1638 1639 1640 1641 1642 1643 1644 1645 1646 1647 1648 1649 1650 1651 1652 1653 | ** is set to point to it and LSM_OK is returned. Otherwise, if an error ** occurs, *ppNext is set to NULL and and lsm error code returned. ** ** Page references returned by this function should be released by the ** caller using lsmFsPageRelease(). */ int lsmFsDbPageNext(Segment *pRun, Page *pPg, int eDir, Page **ppNext){ int rc = LSM_OK; FileSystem *pFS = pPg->pFS; Pgno iPg = pPg->iPg; if( pFS->pCompress ){ int nSpace = pPg->nCompress + 2*3; do { if( eDir>0 ){ rc = fsNextPageOffset(pFS, pRun, iPg, nSpace, &iPg); }else{ if( iPg==pRun->iFirst ){ iPg = 0; }else{ rc = fsGetPageBefore(pFS, iPg, &iPg); } } nSpace = 0; if( iPg!=0 ){ rc = fsPageGet(pFS, iPg, 0, ppNext, &nSpace); assert( (*ppNext==0)==(rc!=LSM_OK || nSpace>0) ); }else{ *ppNext = 0; } }while( nSpace>0 && rc==LSM_OK ); }else{ assert( eDir==1 || eDir==-1 ); if( eDir<0 ){ if( pRun && iPg==pRun->iFirst ){ *ppNext = 0; return LSM_OK; }else if( fsIsFirst(pFS, iPg) ){ iPg = lsmGetU32(&pPg->aData[pFS->nPagesize-4]); }else{ iPg--; } }else{ if( pRun && iPg==pRun->iLastPg ){ *ppNext = 0; return LSM_OK; }else if( fsIsLast(pFS, iPg) ){ iPg = lsmGetU32(&pPg->aData[pFS->nPagesize-4]); }else{ iPg++; } } rc = fsPageGet(pFS, iPg, 0, ppNext, 0); } return rc; } static Pgno findAppendPoint(FileSystem *pFS){ int i; Pgno *aiAppend = pFS->pDb->pWorker->aiAppend; u32 iRet = 0; for(i=LSM_APPLIST_SZ-1; iRet==0 && i>=0; i--){ if( (iRet = aiAppend[i]) ) aiAppend[i] = 0; } return iRet; } /* ** Append a page to file iFile. Set the ref-count to 1 and return a pointer ** to it. The page is writable until either lsmFsPagePersist() is called on ** it or the ref-count drops to zero. */ int lsmFsSortedAppend( FileSystem *pFS, Snapshot *pSnapshot, Segment *p, Page **ppOut ){ int rc = LSM_OK; Page *pPg = 0; *ppOut = 0; int iApp = 0; int iNext = 0; int iPrev = p->iLastPg; if( pFS->pCompress ){ /* In compressed database mode the page is not assigned a page number ** or location in the database file at this point. This will be done ** by the lsmFsPagePersist() call. */ rc = fsPageBuffer(pFS, 1, &pPg); if( rc==LSM_OK ){ pPg->pFS = pFS; pPg->pSeg = p; pPg->iPg = 0; pPg->flags |= PAGE_DIRTY; pPg->nData = pFS->nPagesize; assert( pPg->aData ); pPg->nRef = 1; pFS->nOut++; } }else{ if( iPrev==0 ){ iApp = findAppendPoint(pFS); }else if( fsIsLast(pFS, iPrev) ){ int iNext; rc = fsBlockNext(pFS, fsPageToBlock(pFS, iPrev), &iNext); if( rc!=LSM_OK ) return rc; iApp = fsFirstPageOnBlock(pFS, iNext); }else{ iApp = iPrev + 1; } /* If this is the first page allocated, or if the page allocated is the ** last in the block, allocate a new block here. */ if( iApp==0 || fsIsLast(pFS, iApp) ){ int iNew; /* New block number */ lsmBlockAllocate(pFS->pDb, &iNew); if( iApp==0 ){ iApp = fsFirstPageOnBlock(pFS, iNew); }else{ iNext = fsFirstPageOnBlock(pFS, iNew); } } /* Grab the new page. */ pPg = 0; rc = fsPageGet(pFS, iApp, 1, &pPg, 0); assert( rc==LSM_OK || pPg==0 ); /* If this is the first or last page of a block, fill in the pointer ** value at the end of the new page. */ if( rc==LSM_OK ){ p->nSize++; p->iLastPg = iApp; if( p->iFirst==0 ) p->iFirst = iApp; pPg->flags |= PAGE_DIRTY; if( fsIsLast(pFS, iApp) ){ lsmPutU32(&pPg->aData[pFS->nPagesize-4], iNext); }else if( fsIsFirst(pFS, iApp) ){ lsmPutU32(&pPg->aData[pFS->nPagesize-4], iPrev); } } } *ppOut = pPg; return rc; } /* ** Mark the sorted run passed as the second argument as finished. */ int lsmFsSortedFinish(FileSystem *pFS, Segment *p){ int rc = LSM_OK; if( p && p->iLastPg ){ int iBlk; /* Check if the last page of this run happens to be the last of a block. ** If it is, then an extra block has already been allocated for this run. ** Shift this extra block back to the free-block list. ** ** Otherwise, add the first free page in the last block used by the run ** to the lAppend list. */ iBlk = fsPageToBlock(pFS, p->iLastPg); if( fsLastPageOnPagesBlock(pFS, p->iLastPg)!=p->iLastPg ){ int i; Pgno *aiAppend = pFS->pDb->pWorker->aiAppend; for(i=0; i<LSM_APPLIST_SZ; i++){ if( aiAppend[i]==0 ){ aiAppend[i] = p->iLastPg+1; break; } } }else if( pFS->pCompress==0 ){ Page *pLast; rc = fsPageGet(pFS, p->iLastPg, 0, &pLast, 0); if( rc==LSM_OK ){ int iPg = (int)lsmGetU32(&pLast->aData[pFS->nPagesize-4]); lsmBlockRefree(pFS->pDb, fsPageToBlock(pFS, iPg)); lsmFsPageRelease(pLast); } }else{ int iBlk = 0; rc = fsBlockNext(pFS, fsPageToBlock(pFS, p->iLastPg), &iBlk); if( rc==LSM_OK ){ lsmBlockRefree(pFS->pDb, iBlk); } } } return rc; } /* ** Obtain a reference to page number iPg. */ int lsmFsDbPageGet(FileSystem *pFS, Pgno iPg, Page **ppPg){ assert( pFS ); return fsPageGet(pFS, iPg, 0, ppPg, 0); } /* ** Obtain a reference to the last page in the segment passed as the ** second argument. */ int lsmFsDbPageLast(FileSystem *pFS, Segment *pSeg, Page **ppPg){ int rc; Pgno iPg = pSeg->iLastPg; if( pFS->pCompress ){ int nSpace; iPg++; do { nSpace = 0; rc = fsGetPageBefore(pFS, iPg, &iPg); if( rc==LSM_OK ){ rc = fsPageGet(pFS, iPg, 0, ppPg, &nSpace); } }while( rc==LSM_OK && nSpace>0 ); }else{ rc = fsPageGet(pFS, iPg, 0, ppPg, 0); } return rc; } /* ** Return a reference to meta-page iPg. If successful, LSM_OK is returned ** and *ppPg populated with the new page reference. The reference should ** be released by the caller using lsmFsPageRelease(). ** |
︙ | ︙ | |||
1274 1275 1276 1277 1278 1279 1280 | ** set *pnData to the size of the meta-page in bytes before returning. */ u8 *lsmFsMetaPageData(MetaPage *pPg, int *pnData){ if( pnData ) *pnData = pPg->pFS->nMetasize; return pPg->aData; } | < < < < < < < < < > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > | | | | | | | | | | | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 1768 1769 1770 1771 1772 1773 1774 1775 1776 1777 1778 1779 1780 1781 1782 1783 1784 1785 1786 1787 1788 1789 1790 1791 1792 1793 1794 1795 1796 1797 1798 1799 1800 1801 1802 1803 1804 1805 1806 1807 1808 1809 1810 1811 1812 1813 1814 1815 1816 1817 1818 1819 1820 1821 1822 1823 1824 1825 1826 1827 1828 1829 1830 1831 1832 1833 1834 1835 1836 1837 1838 1839 1840 1841 1842 1843 1844 1845 1846 1847 1848 1849 1850 1851 1852 1853 1854 1855 1856 1857 1858 1859 1860 1861 1862 1863 1864 1865 1866 1867 1868 1869 1870 1871 1872 1873 1874 1875 1876 1877 1878 1879 1880 1881 1882 1883 1884 1885 1886 1887 1888 1889 1890 1891 1892 1893 1894 1895 1896 1897 1898 1899 1900 1901 1902 1903 1904 1905 1906 1907 1908 1909 1910 1911 1912 1913 1914 1915 1916 1917 1918 1919 1920 1921 1922 1923 1924 1925 1926 1927 1928 1929 1930 1931 1932 1933 1934 1935 1936 1937 1938 1939 1940 1941 1942 1943 1944 1945 1946 1947 1948 1949 1950 1951 1952 1953 1954 1955 1956 1957 1958 1959 1960 1961 1962 1963 1964 1965 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 1981 | ** set *pnData to the size of the meta-page in bytes before returning. */ u8 *lsmFsMetaPageData(MetaPage *pPg, int *pnData){ if( pnData ) *pnData = pPg->pFS->nMetasize; return pPg->aData; } /* ** Return true if page is currently writable. */ int lsmFsPageWritable(Page *pPg){ return (pPg->flags & PAGE_DIRTY) ? 1 : 0; } /* ** Append raw data to a segment. This function is only used in compressed ** database mode. */ static Pgno fsAppendData( FileSystem *pFS, /* File-system handle */ Segment *pSeg, /* Segment to append to */ const u8 *aData, /* Buffer containing data to write */ int nData, /* Size of buffer aData[] in bytes */ int *pRc /* IN/OUT: Error code */ ){ Pgno iRet = 0; int rc = *pRc; assert( pFS->pCompress ); if( rc==LSM_OK ){ int nRem; int nWrite; Pgno iLastOnBlock; Pgno iApp = pSeg->iLastPg+1; /* If this is the first data written into the segment, find an append-point ** or allocate a new block. */ if( iApp==1 ){ pSeg->iFirst = iApp = findAppendPoint(pFS); if( iApp==0 ){ int iBlk; rc = lsmBlockAllocate(pFS->pDb, &iBlk); pSeg->iFirst = iApp = fsFirstPageOnBlock(pFS, iBlk); } } iRet = iApp; /* Write as much data as is possible at iApp (usually all of it). */ iLastOnBlock = fsLastPageOnPagesBlock(pFS, iApp); if( rc==LSM_OK ){ int nSpace = iLastOnBlock - iApp + 1; nWrite = LSM_MIN(nData, nSpace); nRem = nData - nWrite; assert( nWrite>=0 ); if( nWrite!=0 ){ rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aData, nWrite); } iApp += nWrite; } /* If required, allocate a new block and write the rest of the data ** into it. Set the next and previous block pointers to link the new ** block to the old. */ assert( nRem<=0 || (iApp-1)==iLastOnBlock ); if( rc==LSM_OK && (iApp-1)==iLastOnBlock ){ u8 aPtr[4]; /* Space to serialize a u32 */ int iBlk; /* New block number */ if( nWrite>0 ){ /* Allocate a new block. */ rc = lsmBlockAllocate(pFS->pDb, &iBlk); /* Set the "next" pointer on the old block */ if( rc==LSM_OK ){ assert( iApp==(fsPageToBlock(pFS, iApp)*pFS->nBlocksize)-4 ); lsmPutU32(aPtr, iBlk); rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, aPtr, sizeof(aPtr)); } /* Set the "prev" pointer on the new block */ if( rc==LSM_OK ){ Pgno iWrite; lsmPutU32(aPtr, fsPageToBlock(pFS, iApp)); iWrite = fsFirstPageOnBlock(pFS, iBlk); rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iWrite-4, aPtr, sizeof(aPtr)); if( nRem>0 ) iApp = iWrite; } }else{ /* The next block is already allocated. */ assert( nRem>0 ); rc = fsBlockNext(pFS, fsPageToBlock(pFS, iApp), &iBlk); iApp = fsFirstPageOnBlock(pFS, iBlk); } /* Write the remaining data into the new block */ if( rc==LSM_OK && nRem>0 ){ rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iApp, &aData[nWrite], nRem); iApp += nRem; } } pSeg->iLastPg = iApp-1; *pRc = rc; } return iRet; } /* ** This function is only called in compressed database mode. It ** compresses the contents of page pPg and writes the result to the ** buffer at pFS->aBuffer. The size of the compressed data is stored in ** pPg->nCompress. ** ** If buffer pFS->aBuffer[] has not been allocated then this function ** allocates it. If this fails, LSM_NOMEM is returned. Otherwise, LSM_OK. */ static int fsCompressIntoBuffer(FileSystem *pFS, Page *pPg){ lsm_compress *p = pFS->pCompress; if( fsAllocateBuffer(pFS) ) return LSM_NOMEM; assert( pPg->nData==pFS->nPagesize ); pPg->nCompress = pFS->nBuffer; return p->xCompress(p->pCtx, (char *)pFS->aBuffer, &pPg->nCompress, (const char *)pPg->aData, pPg->nData ); } /* ** If the page passed as an argument is dirty, update the database file ** (or mapping of the database file) with its current contents and mark ** the page as clean. ** ** Return LSM_OK if the operation is a success, or an LSM error code ** otherwise. */ int lsmFsPagePersist(Page *pPg){ int rc = LSM_OK; if( pPg && (pPg->flags & PAGE_DIRTY) ){ FileSystem *pFS = pPg->pFS; if( pFS->pCompress ){ int iHash; /* Hash key of assigned page number */ u8 aSz[3]; /* pPg->nCompress as a 24-bit big-endian */ assert( pPg->pSeg && pPg->iPg==0 && pPg->nCompress==0 ); /* Compress the page image. */ rc = fsCompressIntoBuffer(pFS, pPg); /* Serialize the compressed size into buffer aSz[] */ putRecordSize(aSz, pPg->nCompress, 0); /* Write the serialized page record into the database file. */ pPg->iPg = fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc); fsAppendData(pFS, pPg->pSeg, pFS->aBuffer, pPg->nCompress, &rc); fsAppendData(pFS, pPg->pSeg, aSz, sizeof(aSz), &rc); /* Now that it has a page number, insert the page into the hash table */ iHash = fsHashKey(pFS->nHash, pPg->iPg); pPg->pHashNext = pFS->apHash[iHash]; pFS->apHash[iHash] = pPg; pPg->pSeg->nSize += (sizeof(aSz) * 2) + pPg->nCompress; }else{ i64 iOff; /* Offset to write within database file */ iOff = (i64)pFS->nPagesize * (i64)(pPg->iPg-1); if( pFS->bUseMmap==0 ){ rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, pPg->aData,pFS->nPagesize); }else if( pPg->flags & PAGE_FREE ){ fsGrowMapping(pFS, iOff + pFS->nPagesize, &rc); if( rc==LSM_OK ){ u8 *aTo = &((u8 *)(pFS->pMap))[iOff]; memcpy(aTo, pPg->aData, pFS->nPagesize); lsmFree(pFS->pEnv, pPg->aData); pPg->aData = aTo; pPg->flags &= ~PAGE_FREE; fsPageAddToLru(pFS, pPg); } } } pPg->flags &= ~PAGE_DIRTY; pFS->nWrite++; } return rc; } /* ** For non-compressed databases, this function is a no-op. For compressed ** databases, it adds a padding record to the segment passed as the third ** argument. ** ** The size of the padding records is selected so that the last byte ** written is the last byte of a disk sector. This means that if a ** snapshot is taken and checkpointed, subsequent worker processes will ** not write to any sector that contains checkpointed data. */ int lsmFsSortedPadding( FileSystem *pFS, Snapshot *pSnapshot, Segment *pSeg ){ int rc = LSM_OK; if( pFS->pCompress ){ Pgno iLast2; Pgno iLast = pSeg->iLastPg; /* Current last page of segment */ int nPad; /* Bytes of padding required */ u8 aSz[3]; iLast2 = (1 + iLast/pFS->szSector) * pFS->szSector - 1; assert( fsPageToBlock(pFS, iLast)==fsPageToBlock(pFS, iLast2) ); nPad = iLast2 - iLast; if( iLast2>fsLastPageOnPagesBlock(pFS, iLast) ){ nPad -= 4; } assert( nPad>=0 ); if( nPad>=6 ){ pSeg->nSize += nPad; nPad -= 6; putRecordSize(aSz, nPad, 1); fsAppendData(pFS, pSeg, aSz, sizeof(aSz), &rc); memset(pFS->aBuffer, 0, nPad); fsAppendData(pFS, pSeg, pFS->aBuffer, nPad, &rc); fsAppendData(pFS, pSeg, aSz, sizeof(aSz), &rc); }else if( nPad>0 ){ u8 aBuf[5] = {0,0,0,0,0}; aBuf[0] = (u8)nPad; aBuf[nPad-1] = (u8)nPad; fsAppendData(pFS, pSeg, aBuf, nPad, &rc); } assert( rc!=LSM_OK || pSeg->iLastPg==fsLastPageOnPagesBlock(pFS, pSeg->iLastPg) || ((pSeg->iLastPg + 1) % pFS->szSector)==0 ); } return rc; } /* ** Increment the reference count on the page object passed as the first ** argument. */ void lsmFsPageRef(Page *pPg){ if( pPg ){ |
︙ | ︙ | |||
1462 1463 1464 1465 1466 1467 1468 | }else{ FileSystem *pFS = pDb->pFS; LsmString str; int iBlk; int iLastBlk; iBlk = fsPageToBlock(pFS, pArray->iFirst); | | | | 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 2127 2128 | }else{ FileSystem *pFS = pDb->pFS; LsmString str; int iBlk; int iLastBlk; iBlk = fsPageToBlock(pFS, pArray->iFirst); iLastBlk = fsPageToBlock(pFS, pArray->iLastPg); lsmStringInit(&str, pDb->pEnv); lsmStringAppendf(&str, "%d", pArray->iFirst); while( iBlk!=iLastBlk ){ lsmStringAppendf(&str, " %d", fsLastPageOnBlock(pFS, iBlk)); fsBlockNext(pFS, iBlk, &iBlk); lsmStringAppendf(&str, " %d", fsFirstPageOnBlock(pFS, iBlk)); } lsmStringAppendf(&str, " %d", pArray->iLastPg); *pzOut = str.z; } if( bUnlock ){ int rcwork = LSM_BUSY; lsmFinishWork(pDb, 0, 0, &rcwork); |
︙ | ︙ | |||
1495 1496 1497 1498 1499 1500 1501 | Segment *pSeg, int bExtra, /* If true, count the "next" block if any */ int nUsed, u8 *aUsed ){ if( pSeg ){ if( pSeg && pSeg->nSize>0 ){ | < | | | | 2138 2139 2140 2141 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 | Segment *pSeg, int bExtra, /* If true, count the "next" block if any */ int nUsed, u8 *aUsed ){ if( pSeg ){ if( pSeg && pSeg->nSize>0 ){ Pgno iLast = pSeg->iLastPg; int iBlk; int iLastBlk; iBlk = fsPageToBlock(pFS, pSeg->iFirst); iLastBlk = fsPageToBlock(pFS, pSeg->iLastPg); while( iBlk ){ assert( iBlk<=nUsed ); /* assert( aUsed[iBlk-1]==0 ); */ aUsed[iBlk-1] = 1; if( iBlk!=iLastBlk ){ fsBlockNext(pFS, iBlk, &iBlk); }else{ iBlk = 0; } } if( bExtra && iLast==fsLastPageOnPagesBlock(pFS, iLast) ){ fsBlockNext(pFS, iLastBlk, &iBlk); aUsed[iBlk-1] = 1; } } } } |
︙ | ︙ | |||
1535 1536 1537 1538 1539 1540 1541 1542 1543 1544 | ** This function also checks that there are no references to blocks with ** out-of-range block numbers. ** ** If no errors are found, non-zero is returned. If an error is found, an ** assert() fails. */ int lsmFsIntegrityCheck(lsm_db *pDb){ int i; int j; Freelist freelist = {0, 0, 0}; | > < | | | | 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 | ** This function also checks that there are no references to blocks with ** out-of-range block numbers. ** ** If no errors are found, non-zero is returned. If an error is found, an ** assert() fails. */ int lsmFsIntegrityCheck(lsm_db *pDb){ FileSystem *pFS = pDb->pFS; int i; int j; Freelist freelist = {0, 0, 0}; u8 *aUsed; Level *pLevel; Snapshot *pWorker = pDb->pWorker; int nBlock = pWorker->nBlock; aUsed = lsmMallocZero(pDb->pEnv, nBlock); if( aUsed==0 ){ /* Malloc has failed. Since this function is only called within debug ** builds, this probably means the user is running an OOM injection test. ** Regardless, it will not be possible to run the integrity-check at this ** time, so assume the database is Ok and return non-zero. */ return 1; } for(pLevel=pWorker->pLevel; pLevel; pLevel=pLevel->pNext){ int i; checkBlocks(pFS, &pLevel->lhs, (pLevel->nRight!=0), nBlock, aUsed); for(i=0; i<pLevel->nRight; i++){ |
︙ | ︙ | |||
1584 1585 1586 1587 1588 1589 1590 1591 1592 | } } for(i=0; i<nBlock; i++) assert( aUsed[i]==1 ); lsmFree(pDb->pEnv, aUsed); lsmFree(pDb->pEnv, freelist.aEntry); return 1; } | > > > > > > > > > > > > > > > > > | 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 | } } for(i=0; i<nBlock; i++) assert( aUsed[i]==1 ); lsmFree(pDb->pEnv, aUsed); lsmFree(pDb->pEnv, freelist.aEntry); return 1; } #ifndef NDEBUG /* ** Return true if pPg happens to be the last page in segment pSeg. Or false ** otherwise. This function is only invoked as part of assert() conditions. */ int lsmFsDbPageIsLast(Segment *pSeg, Page *pPg){ if( pPg->pFS->pCompress ){ Pgno iNext = 0; int rc; rc = fsNextPageOffset(pPg->pFS, pSeg, pPg->iPg, pPg->nCompress+6, &iNext); return (rc!=LSM_OK || iNext==0); } return (pPg->iPg==pSeg->iLastPg); } #endif |
Changes to src/lsm_main.c.
︙ | ︙ | |||
319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 | ** in multi-process mode. */ *piVal = lsmDbMultiProc(pDb); }else{ pDb->bMultiProc = *piVal = (*piVal!=0); } break; } default: rc = LSM_MISUSE; break; } va_end(ap); return rc; } void lsmAppendSegmentList(LsmString *pStr, char *zPre, Segment *pSeg){ lsmStringAppendf(pStr, "%s{%d %d %d %d}", zPre, | > > > > > > > > > > > > > > > > > | | 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 | ** in multi-process mode. */ *piVal = lsmDbMultiProc(pDb); }else{ pDb->bMultiProc = *piVal = (*piVal!=0); } break; } case LSM_CONFIG_SET_COMPRESSION: { int *p = va_arg(ap, lsm_compress *); if( pDb->pDatabase ){ /* If lsm_open() has been called, this call is against the rules. */ rc = LSM_MISUSE_BKPT; }else{ memcpy(&pDb->compress, p, sizeof(lsm_compress)); } break; } case LSM_CONFIG_GET_COMPRESSION: { int *p = va_arg(ap, lsm_compress *); memcpy(p, &pDb->compress, sizeof(lsm_compress)); break; } default: rc = LSM_MISUSE; break; } va_end(ap); return rc; } void lsmAppendSegmentList(LsmString *pStr, char *zPre, Segment *pSeg){ lsmStringAppendf(pStr, "%s{%d %d %d %d}", zPre, pSeg->iFirst, pSeg->iLastPg, pSeg->iRoot, pSeg->nSize ); } static int infoGetWorker(lsm_db *pDb, Snapshot **pp, int *pbUnlock){ int rc = LSM_OK; assert( *pbUnlock==0 ); |
︙ | ︙ |
Changes to src/lsm_sorted.c.
︙ | ︙ | |||
25 26 27 28 29 30 31 | ** ** The footer consists of the following values (starting at the end of ** the page and continuing backwards towards the start). All values are ** stored as unsigned big-endian integers. ** ** * Number of records on page (2 bytes). ** * Flags field (2 bytes). | | | 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 | ** ** The footer consists of the following values (starting at the end of ** the page and continuing backwards towards the start). All values are ** stored as unsigned big-endian integers. ** ** * Number of records on page (2 bytes). ** * Flags field (2 bytes). ** * Left-hand pointer value (8 bytes). ** * The starting offset of each record (2 bytes per record). ** ** Records may span pages. Unless it happens to be an exact fit, the part ** of the final record that starts on page X that does not fit on page X ** is stored at the start of page (X+1). This means there may be pages where ** (N==0). And on most pages the first record that starts on the page will ** not start at byte offset 0. For example: |
︙ | ︙ | |||
83 84 85 86 87 88 89 | #define rtIsSystem(eType) (((eType) & LSM_SYSTEMKEY)!=0) /* ** The following macros are used to access a page footer. */ #define SEGMENT_NRECORD_OFFSET(pgsz) ((pgsz) - 2) #define SEGMENT_FLAGS_OFFSET(pgsz) ((pgsz) - 2 - 2) | | | | 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 | #define rtIsSystem(eType) (((eType) & LSM_SYSTEMKEY)!=0) /* ** The following macros are used to access a page footer. */ #define SEGMENT_NRECORD_OFFSET(pgsz) ((pgsz) - 2) #define SEGMENT_FLAGS_OFFSET(pgsz) ((pgsz) - 2 - 2) #define SEGMENT_POINTER_OFFSET(pgsz) ((pgsz) - 2 - 2 - 8) #define SEGMENT_CELLPTR_OFFSET(pgsz, iCell) ((pgsz) - 2 - 2 - 8 - 2 - (iCell)*2) #define SEGMENT_EOF(pgsz, nEntry) SEGMENT_CELLPTR_OFFSET(pgsz, nEntry) #define SEGMENT_BTREE_FLAG 0x0001 #define PGFTR_SKIP_NEXT_FLAG 0x0002 #define PGFTR_SKIP_THIS_FLAG 0x0004 |
︙ | ︙ | |||
118 119 120 121 122 123 124 | Level *pLevel; /* Level object segment is part of */ Segment *pSeg; /* Segment to access */ /* Current page. See segmentPtrLoadPage(). */ Page *pPg; /* Current page */ u16 flags; /* Copy of page flags field */ int nCell; /* Number of cells on pPg */ | | | | 118 119 120 121 122 123 124 125 126 127 128 129 130 131 132 133 134 135 136 137 | Level *pLevel; /* Level object segment is part of */ Segment *pSeg; /* Segment to access */ /* Current page. See segmentPtrLoadPage(). */ Page *pPg; /* Current page */ u16 flags; /* Copy of page flags field */ int nCell; /* Number of cells on pPg */ Pgno iPtr; /* Base cascade pointer */ /* Current cell. See segmentPtrLoadCell() */ int iCell; /* Current record within page pPg */ int eType; /* Type of current record */ Pgno iPgPtr; /* Cascade pointer offset */ void *pKey; int nKey; /* Key associated with current record */ void *pVal; int nVal; /* Current record value (eType==WRITE only) */ /* Blobs used to allocate buffers for pKey and pVal as required */ Blob blob1; Blob blob2; }; |
︙ | ︙ | |||
263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 | typedef struct Hierarchy Hierarchy; struct Hierarchy { Page **apHier; int nHier; }; struct MergeWorker { lsm_db *pDb; /* Database handle */ Level *pLevel; /* Worker snapshot Level being merged */ MultiCursor *pCsr; /* Cursor to read new segment contents from */ int bFlush; /* True if this is an in-memory tree flush */ Hierarchy hier; /* B-tree hierarchy under construction */ Page *pPage; /* Current output page */ int nWork; /* Number of calls to mergeWorkerNextPage() */ Pgno *aGobble; /* Gobble point for each input segment */ }; #ifdef LSM_DEBUG_EXPENSIVE static int assertPointersOk(lsm_db *, Segment *, Segment *, int); static int assertBtreeOk(lsm_db *, Segment *); static void assertRunInOrder(lsm_db *pDb, Segment *pSeg); #else #define assertRunInOrder(x,y) #endif struct FilePage { u8 *aData; int nData; }; static u8 *fsPageData(Page *pPg, int *pnData){ *pnData = ((struct FilePage *)(pPg))->nData; return ((struct FilePage *)(pPg))->aData; | > > > > > > > > > > > > > > > > > > > | 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 | typedef struct Hierarchy Hierarchy; struct Hierarchy { Page **apHier; int nHier; }; /* ** aSave: ** When mergeWorkerNextPage() is called to advance to the next page in ** the output segment, if the bStore flag for an element of aSave[] is ** true, it is cleared and the corresponding iPgno value is set to the ** page number of the page just completed. ** ** aSave[0] is used to record the pointer value to be pushed into the ** b-tree hierarchy. aSave[1] is used to save the page number of the ** page containing the indirect key most recently written to the b-tree. ** see mergeWorkerPushHierarchy() for details. */ struct MergeWorker { lsm_db *pDb; /* Database handle */ Level *pLevel; /* Worker snapshot Level being merged */ MultiCursor *pCsr; /* Cursor to read new segment contents from */ int bFlush; /* True if this is an in-memory tree flush */ Hierarchy hier; /* B-tree hierarchy under construction */ Page *pPage; /* Current output page */ int nWork; /* Number of calls to mergeWorkerNextPage() */ Pgno *aGobble; /* Gobble point for each input segment */ Pgno iIndirect; struct SavedPgno { Pgno iPgno; int bStore; } aSave[2]; }; #ifdef LSM_DEBUG_EXPENSIVE static int assertPointersOk(lsm_db *, Segment *, Segment *, int); static int assertBtreeOk(lsm_db *, Segment *); static void assertRunInOrder(lsm_db *pDb, Segment *pSeg); #else #define assertRunInOrder(x,y) #define assertBtreeOk(x,y) #endif struct FilePage { u8 *aData; int nData; }; static u8 *fsPageData(Page *pPg, int *pnData){ *pnData = ((struct FilePage *)(pPg))->nData; return ((struct FilePage *)(pPg))->aData; |
︙ | ︙ | |||
317 318 319 320 321 322 323 324 325 326 327 328 329 330 | u32 lsmGetU32(u8 *aOut){ return ((u32)aOut[0] << 24) + ((u32)aOut[1] << 16) + ((u32)aOut[2] << 8) + ((u32)aOut[3]); } static int sortedBlobGrow(lsm_env *pEnv, Blob *pBlob, int nData){ assert( pBlob->pEnv==pEnv || (pBlob->pEnv==0 && pBlob->pData==0) ); if( pBlob->nAlloc<nData ){ pBlob->pData = lsmReallocOrFree(pEnv, pBlob->pData, nData); if( !pBlob->pData ) return LSM_NOMEM; pBlob->nAlloc = nData; | > > > > > > > > > > > > > > > > > > > > > > | 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 | u32 lsmGetU32(u8 *aOut){ return ((u32)aOut[0] << 24) + ((u32)aOut[1] << 16) + ((u32)aOut[2] << 8) + ((u32)aOut[3]); } u32 lsmGetU64(u8 *aOut){ return ((u64)aOut[0] << 56) + ((u64)aOut[1] << 48) + ((u64)aOut[2] << 40) + ((u64)aOut[3] << 32) + ((u64)aOut[4] << 24) + ((u32)aOut[5] << 16) + ((u32)aOut[6] << 8) + ((u32)aOut[7]); } void lsmPutU64(u8 *aOut, u64 nVal){ aOut[0] = (u8)((nVal>>56) & 0xFF); aOut[1] = (u8)((nVal>>48) & 0xFF); aOut[2] = (u8)((nVal>>40) & 0xFF); aOut[3] = (u8)((nVal>>32) & 0xFF); aOut[4] = (u8)((nVal>>24) & 0xFF); aOut[5] = (u8)((nVal>>16) & 0xFF); aOut[6] = (u8)((nVal>> 8) & 0xFF); aOut[7] = (u8)((nVal ) & 0xFF); } static int sortedBlobGrow(lsm_env *pEnv, Blob *pBlob, int nData){ assert( pBlob->pEnv==pEnv || (pBlob->pEnv==0 && pBlob->pData==0) ); if( pBlob->nAlloc<nData ){ pBlob->pData = lsmReallocOrFree(pEnv, pBlob->pData, nData); if( !pBlob->pData ) return LSM_NOMEM; pBlob->nAlloc = nData; |
︙ | ︙ | |||
427 428 429 430 431 432 433 | return rc; } static int pageGetNRec(u8 *aData, int nData){ return (int)lsmGetU16(&aData[SEGMENT_NRECORD_OFFSET(nData)]); } | | | | 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 | return rc; } static int pageGetNRec(u8 *aData, int nData){ return (int)lsmGetU16(&aData[SEGMENT_NRECORD_OFFSET(nData)]); } static Pgno pageGetPtr(u8 *aData, int nData){ return (Pgno)lsmGetU64(&aData[SEGMENT_POINTER_OFFSET(nData)]); } static int pageGetFlags(u8 *aData, int nData){ return (int)lsmGetU16(&aData[SEGMENT_FLAGS_OFFSET(nData)]); } static u8 *pageGetCell(u8 *aData, int nData, int iCell){ |
︙ | ︙ | |||
452 453 454 455 456 457 458 | return pageGetNRec(aData, nData); } /* ** Return the decoded (possibly relative) pointer value stored in cell ** iCell from page aData/nData. */ | | | | | 493 494 495 496 497 498 499 500 501 502 503 504 505 506 507 508 509 510 511 512 513 | return pageGetNRec(aData, nData); } /* ** Return the decoded (possibly relative) pointer value stored in cell ** iCell from page aData/nData. */ static Pgno pageGetRecordPtr(u8 *aData, int nData, int iCell){ Pgno iRet; /* Return value */ u8 *aCell; /* Pointer to cell iCell */ assert( iCell<pageGetNRec(aData, nData) && iCell>=0 ); aCell = pageGetCell(aData, nData, iCell); lsmVarintGet64(&aCell[1], &iRet); return iRet; } static u8 *pageGetKey( Page *pPg, /* Page to read from */ int iCell, /* Index of cell on page to read */ int *piTopic, /* OUT: Topic associated with this key */ |
︙ | ︙ | |||
523 524 525 526 527 528 529 | int nData; u8 *aCell; aData = fsPageData(pPg, &nData); aCell = pageGetCell(aData, nData, iKey); assert( aCell[0]==0 ); aCell++; | | | > | | | | 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 | int nData; u8 *aCell; aData = fsPageData(pPg, &nData); aCell = pageGetCell(aData, nData, iKey); assert( aCell[0]==0 ); aCell++; aCell += lsmVarintGet64(aCell, &iRef); lsmVarintGet64(aCell, &iRef); assert( iRef>0 ); return iRef; } #define GETVARINT64(a, i) (((i)=((u8*)(a))[0])<=240?1:lsmVarintGet64((a), &(i))) #define GETVARINT32(a, i) (((i)=((u8*)(a))[0])<=240?1:lsmVarintGet32((a), &(i))) static int pageGetBtreeKey( Page *pPg, int iKey, Pgno *piPtr, int *piTopic, void **ppKey, int *pnKey, Blob *pBlob ){ u8 *aData; int nData; u8 *aCell; int eType; aData = fsPageData(pPg, &nData); assert( SEGMENT_BTREE_FLAG & pageGetFlags(aData, nData) ); assert( iKey>=0 && iKey<pageGetNRec(aData, nData) ); aCell = pageGetCell(aData, nData, iKey); eType = *aCell++; aCell += GETVARINT64(aCell, *piPtr); if( eType==0 ){ int rc; Pgno iRef; /* Page number of referenced page */ Page *pRef; aCell += GETVARINT64(aCell, iRef); rc = lsmFsDbPageGet(lsmPageFS(pPg), iRef, &pRef); if( rc!=LSM_OK ) return rc; pageGetKeyCopy(lsmPageEnv(pPg), pRef, 0, &eType, pBlob); lsmFsPageRelease(pRef); *ppKey = pBlob->pData; *pnKey = pBlob->nData; }else{ |
︙ | ︙ | |||
580 581 582 583 584 585 586 | static int btreeCursorLoadKey(BtreeCursor *pCsr){ int rc = LSM_OK; if( pCsr->iPg<0 ){ pCsr->pKey = 0; pCsr->nKey = 0; pCsr->eType = 0; }else{ | | | 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 | static int btreeCursorLoadKey(BtreeCursor *pCsr){ int rc = LSM_OK; if( pCsr->iPg<0 ){ pCsr->pKey = 0; pCsr->nKey = 0; pCsr->eType = 0; }else{ Pgno dummy; int iPg = pCsr->iPg; int iCell = pCsr->aPg[iPg].iCell; while( iCell<0 && (--iPg)>=0 ){ iCell = pCsr->aPg[iPg].iCell-1; } if( iPg<0 || iCell<0 ) return LSM_CORRUPT_BKPT; |
︙ | ︙ | |||
809 810 811 812 813 814 815 | /* Populate any other aPg[] array entries */ if( rc==LSM_OK && nDepth>1 ){ Blob blob = {0,0,0}; void *pSeek; int nSeek; int iTopicSeek; | < > | 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 | /* Populate any other aPg[] array entries */ if( rc==LSM_OK && nDepth>1 ){ Blob blob = {0,0,0}; void *pSeek; int nSeek; int iTopicSeek; int iPg = 0; int iLoad = pCsr->pSeg->iRoot; Page *pPg = pCsr->aPg[nDepth-1].pPage; if( pageObjGetNRec(pPg)==0 ){ /* This can happen when pPg is the right-most leaf in the b-tree. ** In this case, set the iTopicSeek/pSeek/nSeek key to a value ** greater than any real key. */ assert( iCell==-1 ); iTopicSeek = 1000; pSeek = 0; nSeek = 0; }else{ Pgno dummy; rc = pageGetBtreeKey(pPg, 0, &dummy, &iTopicSeek, &pSeek, &nSeek, &pCsr->blob ); } do { Page *pPg; |
︙ | ︙ | |||
851 852 853 854 855 856 857 | iMax = iCell-1; iMin = 0; while( iMax>=iMin ){ int iTry = (iMin+iMax)/2; void *pKey; int nKey; /* Key for cell iTry */ int iTopic; /* Topic for key pKeyT/nKeyT */ | | | 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 | iMax = iCell-1; iMin = 0; while( iMax>=iMin ){ int iTry = (iMin+iMax)/2; void *pKey; int nKey; /* Key for cell iTry */ int iTopic; /* Topic for key pKeyT/nKeyT */ Pgno iPtr; /* Pointer for cell iTry */ int res; /* (pSeek - pKeyT) */ rc = pageGetBtreeKey(pPg, iTry, &iPtr, &iTopic, &pKey, &nKey,&blob); if( rc!=LSM_OK ) break; res = sortedKeyCompare( xCmp, iTopicSeek, pSeek, nSeek, iTopic, pKey, nKey |
︙ | ︙ | |||
890 891 892 893 894 895 896 | u8 *aData; int nData; pBtreePg = &pCsr->aPg[pCsr->iPg]; aData = fsPageData(pBtreePg->pPage, &nData); pCsr->iPtr = btreeCursorPtr(aData, nData, pBtreePg->iCell+1); if( pBtreePg->iCell<0 ){ | | | 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 | u8 *aData; int nData; pBtreePg = &pCsr->aPg[pCsr->iPg]; aData = fsPageData(pBtreePg->pPage, &nData); pCsr->iPtr = btreeCursorPtr(aData, nData, pBtreePg->iCell+1); if( pBtreePg->iCell<0 ){ Pgno dummy; int i; for(i=pCsr->iPg-1; i>=0; i--){ if( pCsr->aPg[i].iCell>0 ) break; } assert( i>=0 ); rc = pageGetBtreeKey( pCsr->aPg[i].pPage, pCsr->aPg[i].iCell-1, |
︙ | ︙ | |||
1003 1004 1005 1006 1007 1008 1009 | assert( iNew<pPtr->nCell ); pPtr->iCell = iNew; aData = fsPageData(pPtr->pPg, &nPgsz); iOff = lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nPgsz, pPtr->iCell)]); pPtr->eType = aData[iOff]; iOff++; | | | 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 | assert( iNew<pPtr->nCell ); pPtr->iCell = iNew; aData = fsPageData(pPtr->pPg, &nPgsz); iOff = lsmGetU16(&aData[SEGMENT_CELLPTR_OFFSET(nPgsz, pPtr->iCell)]); pPtr->eType = aData[iOff]; iOff++; iOff += GETVARINT64(&aData[iOff], pPtr->iPgPtr); iOff += GETVARINT32(&aData[iOff], pPtr->nKey); if( rtIsWrite(pPtr->eType) ){ iOff += GETVARINT32(&aData[iOff], pPtr->nVal); } assert( pPtr->nKey>=0 ); rc = segmentPtrReadData( |
︙ | ︙ | |||
1045 1046 1047 1048 1049 1050 1051 | u8 *aData; int nData; aData = lsmFsPageData(pPg, &nData); if( pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG ){ void *pKey; int nKey; | | | 1087 1088 1089 1090 1091 1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 | u8 *aData; int nData; aData = lsmFsPageData(pPg, &nData); if( pageGetFlags(aData, nData) & SEGMENT_BTREE_FLAG ){ void *pKey; int nKey; Pgno dummy; rc = pageGetBtreeKey( pPg, pMerge->splitkey.iCell, &dummy, &iTopic, &pKey, &nKey, &blob ); if( rc==LSM_OK && blob.pData!=pKey ){ rc = sortedBlobSet(pEnv, &blob, pKey, nKey); } }else{ |
︙ | ︙ | |||
1150 1151 1152 1153 1154 1155 1156 | FileSystem *pFS, SegmentPtr *pPtr, int bLast, int *pRc ){ if( *pRc==LSM_OK ){ Page *pNew = 0; | | > > | > | 1192 1193 1194 1195 1196 1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 | FileSystem *pFS, SegmentPtr *pPtr, int bLast, int *pRc ){ if( *pRc==LSM_OK ){ Page *pNew = 0; if( bLast ){ *pRc = lsmFsDbPageLast(pFS, pPtr->pSeg, &pNew); }else{ *pRc = lsmFsDbPageGet(pFS, pPtr->pSeg->iFirst, &pNew); } segmentPtrSetPage(pPtr, pNew); } } /* ** Try to move the segment pointer passed as the second argument so that it |
︙ | ︙ | |||
1530 1531 1532 1533 1534 1535 1536 | int *pbStop ){ int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp; int res; /* Result of comparison operation */ int rc = LSM_OK; int iMin; int iMax; | | | 1575 1576 1577 1578 1579 1580 1581 1582 1583 1584 1585 1586 1587 1588 1589 | int *pbStop ){ int (*xCmp)(void *, int, void *, int) = pCsr->pDb->xCmp; int res; /* Result of comparison operation */ int rc = LSM_OK; int iMin; int iMax; Pgno iPtrOut = 0; const int iTopic = 0; /* If the current page contains an oversized entry, then there are no ** pointers to one or more of the subsequent pages in the sorted run. ** The following call ensures that the segment-ptr points to the correct ** page in this case. */ rc = segmentPtrSearchOversized(pCsr, pPtr, pKey, nKey); |
︙ | ︙ | |||
1554 1555 1556 1557 1558 1559 1560 | */ #if 0 assert( assertKeyLocation(pCsr, pPtr, pKey, nKey) ); #endif assert( pPtr->nCell>0 || pPtr->pSeg->nSize==1 | | | 1599 1600 1601 1602 1603 1604 1605 1606 1607 1608 1609 1610 1611 1612 1613 | */ #if 0 assert( assertKeyLocation(pCsr, pPtr, pKey, nKey) ); #endif assert( pPtr->nCell>0 || pPtr->pSeg->nSize==1 || lsmFsDbPageIsLast(pPtr->pSeg, pPtr->pPg) ); if( pPtr->nCell==0 ){ segmentPtrReset(pPtr); }else{ iMin = 0; iMax = pPtr->nCell-1; |
︙ | ︙ | |||
1698 1699 1700 1701 1702 1703 1704 | iMin = 0; iMax = nRec-1; while( iMax>=iMin ){ int iTry = (iMin+iMax)/2; void *pKeyT; int nKeyT; /* Key for cell iTry */ int iTopicT; /* Topic for key pKeyT/nKeyT */ | | | 1743 1744 1745 1746 1747 1748 1749 1750 1751 1752 1753 1754 1755 1756 1757 | iMin = 0; iMax = nRec-1; while( iMax>=iMin ){ int iTry = (iMin+iMax)/2; void *pKeyT; int nKeyT; /* Key for cell iTry */ int iTopicT; /* Topic for key pKeyT/nKeyT */ Pgno iPtr; /* Pointer associated with cell iTry */ int res; /* (pKey - pKeyT) */ rc = pageGetBtreeKey(pPg, iTry, &iPtr, &iTopicT, &pKeyT, &nKeyT, &blob); if( rc!=LSM_OK ) break; if( piFirst && pKeyT==blob.pData ){ *piFirst = pageGetBtreeRef(pPg, iTry); piFirst = 0; |
︙ | ︙ | |||
2124 2125 2126 2127 2128 2129 2130 2131 2132 2133 2134 2135 2136 2137 2138 2139 2140 2141 2142 2143 2144 2145 2146 | static int multiCursorAddAll(MultiCursor *pCsr, Snapshot *pSnap){ Level *pLvl; int nPtr = 0; int iPtr = 0; int rc = LSM_OK; for(pLvl=pSnap->pLevel; pLvl; pLvl=pLvl->pNext){ nPtr += (1 + pLvl->nRight); } assert( pCsr->aPtr==0 ); pCsr->aPtr = lsmMallocZeroRc(pCsr->pDb->pEnv, sizeof(SegmentPtr) * nPtr, &rc); if( rc==LSM_OK ) pCsr->nPtr = nPtr; for(pLvl=pSnap->pLevel; pLvl && rc==LSM_OK; pLvl=pLvl->pNext){ int i; pCsr->aPtr[iPtr].pLevel = pLvl; pCsr->aPtr[iPtr].pSeg = &pLvl->lhs; iPtr++; for(i=0; i<pLvl->nRight; i++){ pCsr->aPtr[iPtr].pLevel = pLvl; pCsr->aPtr[iPtr].pSeg = &pLvl->aRhs[i]; iPtr++; | > > | 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 | static int multiCursorAddAll(MultiCursor *pCsr, Snapshot *pSnap){ Level *pLvl; int nPtr = 0; int iPtr = 0; int rc = LSM_OK; for(pLvl=pSnap->pLevel; pLvl; pLvl=pLvl->pNext){ if( pLvl->iAge<0 ) continue; nPtr += (1 + pLvl->nRight); } assert( pCsr->aPtr==0 ); pCsr->aPtr = lsmMallocZeroRc(pCsr->pDb->pEnv, sizeof(SegmentPtr) * nPtr, &rc); if( rc==LSM_OK ) pCsr->nPtr = nPtr; for(pLvl=pSnap->pLevel; pLvl && rc==LSM_OK; pLvl=pLvl->pNext){ int i; if( pLvl->iAge<0 ) continue; pCsr->aPtr[iPtr].pLevel = pLvl; pCsr->aPtr[iPtr].pSeg = &pLvl->lhs; iPtr++; for(i=0; i<pLvl->nRight; i++){ pCsr->aPtr[iPtr].pLevel = pLvl; pCsr->aPtr[iPtr].pSeg = &pLvl->aRhs[i]; iPtr++; |
︙ | ︙ | |||
2859 2860 2861 2862 2863 2864 2865 | MergeWorker *pMW, /* Merge worker */ int bSep /* True for separators run */ ){ Segment *pSeg; /* Segment being written */ lsm_db *pDb = pMW->pDb; /* Database handle */ int rc = LSM_OK; /* Return code */ int i; | < < | 2906 2907 2908 2909 2910 2911 2912 2913 2914 2915 2916 2917 2918 2919 2920 2921 2922 | MergeWorker *pMW, /* Merge worker */ int bSep /* True for separators run */ ){ Segment *pSeg; /* Segment being written */ lsm_db *pDb = pMW->pDb; /* Database handle */ int rc = LSM_OK; /* Return code */ int i; Page **apHier = pMW->hier.apHier; int nHier = pMW->hier.nHier; pSeg = &pMW->pLevel->lhs; for(i=0; rc==LSM_OK && i<nHier; i++){ Page *pNew = 0; rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pSeg, &pNew); assert( rc==LSM_OK ); |
︙ | ︙ | |||
2890 2891 2892 2893 2894 2895 2896 | ** since sometimes n1>n2, the page content and footer must be copied ** separately. */ int nEntry = pageGetNRec(a2, n2); int iEof1 = SEGMENT_EOF(n1, nEntry); int iEof2 = SEGMENT_EOF(n2, nEntry); memcpy(a1, a2, iEof2); memcpy(&a1[iEof1], &a2[iEof2], n2 - iEof2); | < < | < < < < | 2935 2936 2937 2938 2939 2940 2941 2942 2943 2944 2945 2946 2947 2948 2949 2950 2951 2952 2953 2954 2955 2956 2957 2958 2959 2960 2961 2962 2963 2964 2965 2966 2967 2968 2969 2970 2971 2972 2973 2974 2975 2976 2977 2978 2979 2980 2981 | ** since sometimes n1>n2, the page content and footer must be copied ** separately. */ int nEntry = pageGetNRec(a2, n2); int iEof1 = SEGMENT_EOF(n1, nEntry); int iEof2 = SEGMENT_EOF(n2, nEntry); memcpy(a1, a2, iEof2); memcpy(&a1[iEof1], &a2[iEof2], n2 - iEof2); lsmFsPageRelease(apHier[i]); apHier[i] = pNew; }else{ lsmPutU16(&a1[SEGMENT_FLAGS_OFFSET(n1)], SEGMENT_BTREE_FLAG); lsmPutU16(&a1[SEGMENT_NRECORD_OFFSET(n1)], 0); lsmPutU64(&a1[SEGMENT_POINTER_OFFSET(n1)], 0); i = i - 1; lsmFsPageRelease(pNew); } } } #ifdef LSM_DEBUG if( rc==LSM_OK ){ for(i=0; i<nHier; i++) assert( lsmFsPageWritable(apHier[i]) ); } #endif return rc; } /* ** Allocate and populate the MergeWorker.apHier[] array. */ static int mergeWorkerLoadHierarchy(MergeWorker *pMW){ int rc = LSM_OK; Segment *pSeg; Hierarchy *p; pSeg = &pMW->pLevel->lhs; p = &pMW->hier; if( p->apHier==0 && pSeg->iRoot!=0 ){ FileSystem *pFS = pMW->pDb->pFS; lsm_env *pEnv = pMW->pDb->pEnv; Page **apHier = 0; int nHier = 0; int iPg = pSeg->iRoot; do { |
︙ | ︙ | |||
2954 2955 2956 2957 2958 2959 2960 | Page **apNew = (Page **)lsmRealloc( pEnv, apHier, sizeof(Page *)*(nHier+1) ); if( apNew==0 ){ rc = LSM_NOMEM_BKPT; break; } | < > > > > > < < < | 2993 2994 2995 2996 2997 2998 2999 3000 3001 3002 3003 3004 3005 3006 3007 3008 3009 3010 3011 3012 3013 3014 3015 3016 3017 3018 3019 3020 3021 3022 3023 3024 3025 3026 3027 3028 3029 3030 3031 3032 3033 3034 3035 3036 3037 3038 3039 | Page **apNew = (Page **)lsmRealloc( pEnv, apHier, sizeof(Page *)*(nHier+1) ); if( apNew==0 ){ rc = LSM_NOMEM_BKPT; break; } apHier = apNew; memmove(&apHier[1], &apHier[0], sizeof(Page *) * nHier); nHier++; apHier[0] = pPg; iPg = pageGetPtr(aData, nData); }else{ lsmFsPageRelease(pPg); break; } }while( 1 ); if( rc==LSM_OK ){ u8 *aData; int nData; aData = fsPageData(apHier[0], &nData); pMW->aSave[0].iPgno = pageGetPtr(aData, nData); p->nHier = nHier; p->apHier = apHier; rc = mergeWorkerMoveHierarchy(pMW, 0); }else{ int i; for(i=0; i<nHier; i++){ lsmFsPageRelease(apHier[i]); } lsmFree(pEnv, apHier); } } return rc; } /* ** B-tree pages use almost the same format as regular pages. The ** differences are: ** ** 1. The record format is (usually, see below) as follows: ** ** + Type byte (always SORTED_SEPARATOR or SORTED_SYSTEM_SEPARATOR), ** + Absolute pointer value (varint), |
︙ | ︙ | |||
3009 3010 3011 3012 3013 3014 3015 | ** ** 4. The pointer in the page footer of a b-tree page points to a page ** that contains keys equal to or larger than the largest key on the ** b-tree page. ** ** The reason for having the page footer pointer point to the right-child ** (instead of the left) is that doing things this way makes the | | > | | < > > | < | | > > | < < < < | < < < < < | < < < < < < < < < | < < < < < < < | < < | < > | > > | | < | | | > | > > > | | > | < < < < < < | < | > > | > | > > > > > > > | > > > > > > > > > > > > > > > > > > > > > | > > > > > > | > > | | > > > > > | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | > | < | < | 3049 3050 3051 3052 3053 3054 3055 3056 3057 3058 3059 3060 3061 3062 3063 3064 3065 3066 3067 3068 3069 3070 3071 3072 3073 3074 3075 3076 3077 3078 3079 3080 3081 3082 3083 3084 3085 3086 3087 3088 3089 3090 3091 3092 3093 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 3129 3130 3131 3132 3133 3134 3135 3136 3137 3138 3139 3140 3141 3142 3143 3144 3145 3146 3147 3148 3149 3150 3151 3152 3153 3154 3155 3156 3157 3158 3159 3160 3161 3162 3163 3164 3165 3166 3167 3168 3169 3170 3171 3172 3173 3174 3175 3176 3177 3178 3179 3180 3181 3182 3183 3184 3185 3186 3187 3188 3189 3190 3191 3192 3193 3194 3195 3196 3197 3198 3199 3200 3201 3202 3203 3204 3205 3206 3207 3208 3209 3210 3211 3212 3213 3214 3215 3216 3217 3218 3219 3220 3221 3222 3223 3224 3225 3226 3227 3228 3229 3230 3231 3232 3233 3234 3235 3236 3237 3238 3239 3240 3241 3242 3243 3244 3245 3246 3247 3248 3249 3250 3251 3252 3253 3254 3255 3256 3257 3258 3259 3260 3261 3262 3263 3264 3265 3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 3279 3280 3281 3282 3283 3284 3285 3286 3287 3288 3289 3290 3291 3292 3293 3294 3295 3296 3297 3298 3299 3300 3301 3302 3303 3304 3305 3306 3307 3308 3309 3310 3311 3312 3313 3314 3315 3316 3317 3318 3319 3320 3321 3322 3323 3324 3325 3326 3327 3328 3329 3330 3331 3332 3333 3334 3335 3336 3337 3338 3339 3340 3341 3342 3343 3344 3345 3346 3347 3348 3349 3350 3351 3352 3353 3354 3355 3356 3357 3358 | ** ** 4. The pointer in the page footer of a b-tree page points to a page ** that contains keys equal to or larger than the largest key on the ** b-tree page. ** ** The reason for having the page footer pointer point to the right-child ** (instead of the left) is that doing things this way makes the ** mergeWorkerMoveHierarchy() operation less complicated (since the pointers ** that need to be updated are all stored as fixed-size integers within the ** page footer, not varints in page records). ** ** Records may not span b-tree pages. If this function is called to add a ** record larger than (page-size / 4) bytes, then a pointer to the indexed ** array page that contains the main record is added to the b-tree instead. ** In this case the record format is: ** ** + 0x00 byte (1 byte) ** + Absolute pointer value (varint), ** + Absolute page number of page containing key (varint). ** ** See function seekInBtree() for the code that traverses b-tree pages. */ static int mergeWorkerBtreeWrite( MergeWorker *pMW, u8 eType, Pgno iPtr, Pgno iKeyPg, void *pKey, int nKey ){ Segment *pSeg = &pMW->pLevel->lhs; Hierarchy *p = &pMW->hier; lsm_db *pDb = pMW->pDb; /* Database handle */ int rc = LSM_OK; /* Return Code */ int iLevel; /* Level of b-tree hierachy to write to */ int nData; /* Size of aData[] in bytes */ u8 *aData; /* Page data for level iLevel */ int iOff; /* Offset on b-tree page to write record to */ int nRec; /* Initial number of records on b-tree page */ /* iKeyPg should be zero for an ordinary b-tree key, or non-zero for an ** indirect key. The flags byte for an indirect key is 0x00. */ assert( (eType==0)==(iKeyPg!=0) ); /* The MergeWorker.apHier[] array contains the right-most leaf of the b-tree ** hierarchy, the root node, and all nodes that lie on the path between. ** apHier[0] is the right-most leaf and apHier[pMW->nHier-1] is the current ** root page. ** ** This loop searches for a node with enough space to store the key on, ** starting with the leaf and iterating up towards the root. When the loop ** exits, the key may be written to apHier[iLevel]. */ for(iLevel=0; iLevel<=p->nHier; iLevel++){ int nByte; /* Number of free bytes required */ if( iLevel==p->nHier ){ /* Extend the array and allocate a new root page. */ Page **aNew; aNew = (Page **)lsmRealloc( pMW->pDb->pEnv, p->apHier, sizeof(Page *)*(p->nHier+1) ); if( !aNew ){ return LSM_NOMEM_BKPT; } p->apHier = aNew; }else{ Page *pOld; int nFree; /* If the key will fit on this page, break out of the loop here. ** The new entry will be written to page apHier[iLevel]. */ pOld = p->apHier[iLevel]; assert( lsmFsPageWritable(pOld) ); aData = fsPageData(pOld, &nData); if( eType==0 ){ nByte = 2 + 1 + lsmVarintLen32(iPtr) + lsmVarintLen32(iKeyPg); }else{ nByte = 2 + 1 + lsmVarintLen32(iPtr) + lsmVarintLen32(nKey) + nKey; } nRec = pageGetNRec(aData, nData); nFree = SEGMENT_EOF(nData, nRec) - mergeWorkerPageOffset(aData, nData); if( nByte<=nFree ) break; /* Otherwise, this page is full. Set the right-hand-child pointer ** to iPtr and release it. */ lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iPtr); rc = lsmFsPagePersist(pOld); if( rc==LSM_OK ){ iPtr = lsmFsPageNumber(pOld); lsmFsPageRelease(pOld); } } /* Allocate a new page for apHier[iLevel]. */ p->apHier[iLevel] = 0; if( rc==LSM_OK ){ rc = lsmFsSortedAppend( pDb->pFS, pDb->pWorker, pSeg, &p->apHier[iLevel] ); } if( rc!=LSM_OK ) return rc; aData = fsPageData(p->apHier[iLevel], &nData); memset(aData, 0, nData); lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], SEGMENT_BTREE_FLAG); lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], 0); if( iLevel==p->nHier ){ p->nHier++; break; } } /* Write the key into page apHier[iLevel]. */ aData = fsPageData(p->apHier[iLevel], &nData); iOff = mergeWorkerPageOffset(aData, nData); nRec = pageGetNRec(aData, nData); lsmPutU16(&aData[SEGMENT_CELLPTR_OFFSET(nData, nRec)], iOff); lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], nRec+1); if( eType==0 ){ aData[iOff++] = 0x00; iOff += lsmVarintPut32(&aData[iOff], iPtr); iOff += lsmVarintPut32(&aData[iOff], iKeyPg); }else{ aData[iOff++] = eType; iOff += lsmVarintPut32(&aData[iOff], iPtr); iOff += lsmVarintPut32(&aData[iOff], nKey); memcpy(&aData[iOff], pKey, nKey); } return rc; } static int mergeWorkerBtreeIndirect(MergeWorker *pMW){ int rc = LSM_OK; if( pMW->iIndirect ){ Pgno iKeyPg = pMW->aSave[1].iPgno; rc = mergeWorkerBtreeWrite(pMW, 0, pMW->iIndirect, iKeyPg, 0, 0); pMW->iIndirect = 0; } return rc; } /* ** Append the database key (iTopic/pKey/nKey) to the b-tree under ** construction. This key has not yet been written to a segment page. ** The pointer that will accompany the new key in the b-tree - that ** points to the completed segment page that contains keys smaller than ** (pKey/nKey) is currently stored in pMW->aSave[0].iPgno. */ static int mergeWorkerPushHierarchy( MergeWorker *pMW, /* Merge worker object */ int iTopic, /* Topic value for this key */ void *pKey, /* Pointer to key buffer */ int nKey /* Size of pKey buffer in bytes */ ){ lsm_db *pDb = pMW->pDb; /* Database handle */ int rc = LSM_OK; /* Return Code */ int iLevel; /* Level of b-tree hierachy to write to */ int nData; /* Size of aData[] in bytes */ u8 *aData; /* Page data for level iLevel */ int iOff; /* Offset on b-tree page to write record to */ int nRec; /* Initial number of records on b-tree page */ Pgno iPtr; /* Pointer value to accompany pKey/nKey */ Hierarchy *p; Segment *pSeg; /* If there exists a b-tree hierarchy and it is not loaded into ** memory, load it now. */ pSeg = &pMW->pLevel->lhs; p = &pMW->hier; assert( pMW->aSave[0].bStore==0 ); assert( pMW->aSave[1].bStore==0 ); rc = mergeWorkerBtreeIndirect(pMW); /* Obtain the absolute pointer value to store along with the key in the ** page body. This pointer points to a page that contains keys that are ** smaller than pKey/nKey. */ iPtr = pMW->aSave[0].iPgno; assert( iPtr!=0 ); /* Determine if the indirect format should be used. */ if( (nKey*4 > lsmFsPageSize(pMW->pDb->pFS)) ){ pMW->iIndirect = iPtr; pMW->aSave[1].bStore = 1; }else{ rc = mergeWorkerBtreeWrite( pMW, (u8)(iTopic | LSM_SEPARATOR), iPtr, 0, pKey, nKey ); } /* Ensure that the SortedRun.iRoot field is correct. */ return rc; } static int mergeWorkerFinishHierarchy( MergeWorker *pMW /* Merge worker object */ ){ int i; /* Used to loop through apHier[] */ int rc = LSM_OK; /* Return code */ Pgno iPtr; /* New right-hand-child pointer value */ iPtr = pMW->aSave[0].iPgno; for(i=0; i<pMW->hier.nHier && rc==LSM_OK; i++){ Page *pPg = pMW->hier.apHier[i]; int nData; /* Size of aData[] in bytes */ u8 *aData; /* Page data for pPg */ aData = fsPageData(pPg, &nData); lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iPtr); rc = lsmFsPagePersist(pPg); iPtr = lsmFsPageNumber(pPg); lsmFsPageRelease(pPg); } if( pMW->hier.nHier ){ pMW->pLevel->lhs.iRoot = iPtr; lsmFree(pMW->pDb->pEnv, pMW->hier.apHier); pMW->hier.apHier = 0; pMW->hier.nHier = 0; } return rc; } static int mergeWorkerAddPadding( MergeWorker *pMW /* Merge worker object */ ){ FileSystem *pFS = pMW->pDb->pFS; return lsmFsSortedPadding(pFS, pMW->pDb->pWorker, &pMW->pLevel->lhs); } static int keyszToSkip(FileSystem *pFS, int nKey){ int nPgsz; /* Nominal database page size */ nPgsz = lsmFsPageSize(pFS); return LSM_MIN(((nKey * 4) / nPgsz), 3); } /* ** Release the reference to the current output page of merge-worker *pMW ** (reference pMW->pPage). Set the page number values in aSave[] as ** required (see comments above struct MergeWorker for details). */ static int mergeWorkerPersistAndRelease(MergeWorker *pMW){ int rc; int i; assert( pMW->pPage || (pMW->aSave[0].bStore==0 && pMW->aSave[1].bStore==0) ); /* Persist the page */ rc = lsmFsPagePersist(pMW->pPage); /* If required, save the page number. */ for(i=0; i<2; i++){ if( pMW->aSave[i].bStore ){ pMW->aSave[i].iPgno = lsmFsPageNumber(pMW->pPage); pMW->aSave[i].bStore = 0; } } /* Release the completed output page. */ lsmFsPageRelease(pMW->pPage); pMW->pPage = 0; return rc; } /* ** Advance to the next page of an output run being populated by merge-worker ** pMW. The footer of the new page is initialized to indicate that it contains ** zero records. The flags field is cleared. The page footer pointer field ** is set to iFPtr. ** ** If successful, LSM_OK is returned. Otherwise, an error code. */ static int mergeWorkerNextPage( MergeWorker *pMW, /* Merge worker object to append page to */ Pgno iFPtr /* Pointer value for footer of new page */ ){ int rc = LSM_OK; /* Return code */ Page *pNext = 0; /* New page appended to run */ lsm_db *pDb = pMW->pDb; /* Database handle */ Segment *pSeg; /* Run to append to */ pSeg = &pMW->pLevel->lhs; rc = lsmFsSortedAppend(pDb->pFS, pDb->pWorker, pSeg, &pNext); assert( rc!=LSM_OK || pSeg->iFirst>0 || pMW->pDb->compress.xCompress ); if( rc==LSM_OK ){ u8 *aData; /* Data buffer belonging to page pNext */ int nData; /* Size of aData[] in bytes */ rc = mergeWorkerPersistAndRelease(pMW); pMW->pPage = pNext; pMW->pLevel->pMerge->iOutputOff = 0; aData = fsPageData(pNext, &nData); lsmPutU16(&aData[SEGMENT_NRECORD_OFFSET(nData)], 0); lsmPutU16(&aData[SEGMENT_FLAGS_OFFSET(nData)], 0); lsmPutU64(&aData[SEGMENT_POINTER_OFFSET(nData)], iFPtr); pMW->nWork++; } return rc; } /* |
︙ | ︙ | |||
3266 3267 3268 3269 3270 3271 3272 3273 3274 3275 3276 3277 3278 | pMerge->iOutputOff = iOff + nCopy; } } return rc; } static int mergeWorkerWrite( MergeWorker *pMW, /* Merge worker object to write into */ int eType, /* One of SORTED_SEPARATOR, WRITE or DELETE */ void *pKey, int nKey, /* Key value */ MultiCursor *pCsr, /* Read value (if any) from here */ | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | < | > > > > > > | | | < < | < > | | | < | < < < < < < < | | < | < > > | > < < < > | 3396 3397 3398 3399 3400 3401 3402 3403 3404 3405 3406 3407 3408 3409 3410 3411 3412 3413 3414 3415 3416 3417 3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 3432 3433 3434 3435 3436 3437 3438 3439 3440 3441 3442 3443 3444 3445 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 3461 3462 3463 3464 3465 3466 3467 3468 3469 3470 3471 3472 3473 3474 3475 3476 3477 3478 3479 3480 3481 3482 3483 3484 3485 3486 3487 3488 3489 3490 3491 3492 3493 3494 3495 3496 3497 3498 3499 3500 3501 3502 3503 3504 3505 3506 3507 3508 3509 3510 3511 3512 3513 3514 3515 3516 3517 3518 3519 3520 3521 3522 3523 3524 3525 3526 3527 3528 3529 3530 3531 3532 3533 3534 3535 | pMerge->iOutputOff = iOff + nCopy; } } return rc; } /* ** The MergeWorker passed as the only argument is working to merge two or ** more existing segments together (not to flush an in-memory tree). It ** has not yet written the first key to the first page of the output. */ static int mergeWorkerFirstPage(MergeWorker *pMW){ int rc = LSM_OK; /* Return code */ Page *pPg = 0; /* First page of run pSeg */ int iFPtr = 0; /* Pointer value read from footer of pPg */ MultiCursor *pCsr = pMW->pCsr; assert( pMW->pPage==0 ); if( pCsr->pBtCsr ){ rc = LSM_OK; iFPtr = pMW->pLevel->pNext->lhs.iFirst; }else if( pCsr->nPtr>0 ){ Segment *pSeg; pSeg = pCsr->aPtr[pCsr->nPtr-1].pSeg; rc = lsmFsDbPageGet(pMW->pDb->pFS, pSeg->iFirst, &pPg); if( rc==LSM_OK ){ u8 *aData; /* Buffer for page pPg */ int nData; /* Size of aData[] in bytes */ aData = fsPageData(pPg, &nData); iFPtr = pageGetPtr(aData, nData); lsmFsPageRelease(pPg); } } if( rc==LSM_OK ){ rc = mergeWorkerNextPage(pMW, iFPtr); if( pCsr->pPrevMergePtr ) *pCsr->pPrevMergePtr = iFPtr; pMW->aSave[0].bStore = 1; } return rc; } static int mergeWorkerWrite( MergeWorker *pMW, /* Merge worker object to write into */ int eType, /* One of SORTED_SEPARATOR, WRITE or DELETE */ void *pKey, int nKey, /* Key value */ MultiCursor *pCsr, /* Read value (if any) from here */ int iPtr /* Absolute value of page pointer, or 0 */ ){ int rc = LSM_OK; /* Return code */ Merge *pMerge; /* Persistent part of level merge state */ int nHdr; /* Space required for this record header */ Page *pPg; /* Page to write to */ u8 *aData; /* Data buffer for page pWriter->pPage */ int nData; /* Size of buffer aData[] in bytes */ int nRec; /* Number of records on page pPg */ int iFPtr; /* Value of pointer in footer of pPg */ int iRPtr = 0; /* Value of pointer written into record */ int iOff; /* Current write offset within page pPg */ Segment *pSeg; /* Segment being written */ int flags = 0; /* If != 0, flags value for page footer */ int bFirst = 0; /* True for first key of output run */ void *pVal; int nVal; pMerge = pMW->pLevel->pMerge; pSeg = &pMW->pLevel->lhs; if( pSeg->iFirst==0 && pMW->pPage==0 ){ rc = mergeWorkerFirstPage(pMW); bFirst = 1; } pPg = pMW->pPage; if( pPg ){ aData = fsPageData(pPg, &nData); nRec = pageGetNRec(aData, nData); iFPtr = pageGetPtr(aData, nData); iRPtr = iPtr - iFPtr; } /* Figure out how much space is required by the new record. The space ** required is divided into two sections: the header and the body. The ** header consists of the intial varint fields. The body are the blobs ** of data that correspond to the key and value data. The entire header ** must be stored on the page. The body may overflow onto the next and ** subsequent pages. ** ** The header space is: ** ** 1) record type - 1 byte. ** 2) Page-pointer-offset - 1 varint ** 3) Key size - 1 varint ** 4) Value size - 1 varint (only if LSM_INSERT flag is set) */ rc = lsmMCursorValue(pCsr, &pVal, &nVal); if( rc==LSM_OK ){ nHdr = 1 + lsmVarintLen32(iRPtr) + lsmVarintLen32(nKey); if( rtIsWrite(eType) ) nHdr += lsmVarintLen32(nVal); /* If the entire header will not fit on page pPg, or if page pPg is ** marked read-only, advance to the next page of the output run. */ iOff = pMerge->iOutputOff; if( iOff<0 || pPg==0 || iOff+nHdr > SEGMENT_EOF(nData, nRec+1) ){ iFPtr = *pCsr->pPrevMergePtr; iRPtr = iPtr - iFPtr; iOff = 0; nRec = 0; rc = mergeWorkerNextPage(pMW, iFPtr); pPg = pMW->pPage; } } /* If this record header will be the first on the page, and the page is ** not the very first in the entire run, add a copy of the key to the ** b-tree hierarchy. */ if( rc==LSM_OK && nRec==0 && bFirst==0 ){ assert( pMerge->nSkip>=0 ); if( pMerge->nSkip==0 ){ rc = mergeWorkerPushHierarchy(pMW, rtTopic(eType), pKey, nKey); assert( pMW->aSave[0].bStore==0 ); pMW->aSave[0].bStore = 1; pMerge->nSkip = keyszToSkip(pMW->pDb->pFS, nKey); }else{ pMerge->nSkip--; flags = PGFTR_SKIP_THIS_FLAG; } if( pMerge->nSkip ) flags |= PGFTR_SKIP_NEXT_FLAG; } /* Update the output segment */ if( rc==LSM_OK ){ aData = fsPageData(pPg, &nData); |
︙ | ︙ | |||
3418 3419 3420 3421 3422 3423 3424 3425 3426 3427 3428 3429 3430 3431 | /* ** Free all resources allocated by mergeWorkerInit(). */ static void mergeWorkerShutdown(MergeWorker *pMW, int *pRc){ int i; /* Iterator variable */ int rc = *pRc; MultiCursor *pCsr = pMW->pCsr; /* Unless the merge has finished, save the cursor position in the ** Merge.aInput[] array. See function mergeWorkerInit() for the ** code to restore a cursor position based on aInput[]. */ if( rc==LSM_OK && pCsr && lsmMCursorValid(pCsr) ){ Merge *pMerge = pMW->pLevel->pMerge; int bBtree = (pCsr->pBtCsr!=0); | > | 3580 3581 3582 3583 3584 3585 3586 3587 3588 3589 3590 3591 3592 3593 3594 | /* ** Free all resources allocated by mergeWorkerInit(). */ static void mergeWorkerShutdown(MergeWorker *pMW, int *pRc){ int i; /* Iterator variable */ int rc = *pRc; MultiCursor *pCsr = pMW->pCsr; Hierarchy *p = &pMW->hier; /* Unless the merge has finished, save the cursor position in the ** Merge.aInput[] array. See function mergeWorkerInit() for the ** code to restore a cursor position based on aInput[]. */ if( rc==LSM_OK && pCsr && lsmMCursorValid(pCsr) ){ Merge *pMerge = pMW->pLevel->pMerge; int bBtree = (pCsr->pBtCsr!=0); |
︙ | ︙ | |||
3453 3454 3455 3456 3457 3458 3459 | /* Store the location of the split-key */ iPtr = pCsr->aTree[1] - CURSOR_DATA_SEGMENT; if( iPtr<pCsr->nPtr ){ pMerge->splitkey = pMerge->aInput[iPtr]; }else{ btreeCursorSplitkey(pCsr->pBtCsr, &pMerge->splitkey); } | | > | > < < | < < | > > | | | | < < < | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | | 3616 3617 3618 3619 3620 3621 3622 3623 3624 3625 3626 3627 3628 3629 3630 3631 3632 3633 3634 3635 3636 3637 3638 3639 3640 3641 3642 3643 3644 3645 3646 3647 3648 | /* Store the location of the split-key */ iPtr = pCsr->aTree[1] - CURSOR_DATA_SEGMENT; if( iPtr<pCsr->nPtr ){ pMerge->splitkey = pMerge->aInput[iPtr]; }else{ btreeCursorSplitkey(pCsr->pBtCsr, &pMerge->splitkey); } pMerge->iOutputOff = -1; } lsmMCursorClose(pCsr); /* Persist and release the output page. */ if( rc==LSM_OK ) rc = mergeWorkerPersistAndRelease(pMW); if( rc==LSM_OK ) rc = mergeWorkerBtreeIndirect(pMW); if( rc==LSM_OK ) rc = mergeWorkerFinishHierarchy(pMW); if( rc==LSM_OK ) rc = mergeWorkerAddPadding(pMW); lsmFree(pMW->pDb->pEnv, pMW->aGobble); pMW->aGobble = 0; pMW->pCsr = 0; pMW->pPage = 0; pMW->pPage = 0; *pRc = rc; } /* ** The cursor passed as the first argument is being used as the input for ** a merge operation. When this function is called, *piFlags contains the ** database entry flags for the current entry. The entry about to be written ** to the output. |
︙ | ︙ | |||
3604 3605 3606 3607 3608 3609 3610 | } } /* If this is a separator key and we know that the output pointer has not ** changed, there is no point in writing an output record. Otherwise, ** proceed. */ if( rtIsSeparator(eType)==0 || iPtr!=0 ){ | < < < < < < | > > > < | < > | 3734 3735 3736 3737 3738 3739 3740 3741 3742 3743 3744 3745 3746 3747 3748 3749 3750 3751 3752 3753 3754 3755 3756 3757 3758 3759 3760 3761 3762 3763 3764 3765 3766 3767 3768 3769 3770 3771 3772 3773 3774 3775 3776 3777 3778 3779 3780 3781 | } } /* If this is a separator key and we know that the output pointer has not ** changed, there is no point in writing an output record. Otherwise, ** proceed. */ if( rtIsSeparator(eType)==0 || iPtr!=0 ){ /* Write the record into the main run. */ if( rc==LSM_OK ){ rc = mergeWorkerWrite(pMW, eType, pKey, nKey, pCsr, iPtr); } } } /* Advance the cursor to the next input record (assuming one exists). */ assert( lsmMCursorValid(pMW->pCsr) ); if( rc==LSM_OK ) rc = lsmMCursorNext(pMW->pCsr); /* If the cursor is at EOF, the merge is finished. Release all page ** references currently held by the merge worker and inform the ** FileSystem object that no further pages will be appended to either ** the main or separators array. */ if( rc==LSM_OK && !lsmMCursorValid(pMW->pCsr) ){ mergeWorkerShutdown(pMW, &rc); if( pSeg->iFirst ){ rc = lsmFsSortedFinish(pDb->pFS, pSeg); } #ifdef LSM_DEBUG_EXPENSIVE if( rc==LSM_OK ){ #if 0 rc = assertBtreeOk(pDb, pSeg); if( pMW->pCsr->pBtCsr ){ Segment *pNext = &pMW->pLevel->pNext->lhs; rc = assertPointersOk(pDb, pSeg, pNext, 0); } #endif } #endif } return rc; } static int mergeWorkerDone(MergeWorker *pMW){ return pMW->pCsr==0 || !lsmMCursorValid(pMW->pCsr); } |
︙ | ︙ | |||
3676 3677 3678 3679 3680 3681 3682 | int *pnWrite /* OUT: Number of database pages written */ ){ int rc = LSM_OK; /* Return Code */ MultiCursor *pCsr = 0; Level *pNext = 0; /* The current top level */ Level *pNew; /* The new level itself */ Segment *pDel = 0; /* Delete separators from this segment */ | < | 3802 3803 3804 3805 3806 3807 3808 3809 3810 3811 3812 3813 3814 3815 | int *pnWrite /* OUT: Number of database pages written */ ){ int rc = LSM_OK; /* Return Code */ MultiCursor *pCsr = 0; Level *pNext = 0; /* The current top level */ Level *pNew; /* The new level itself */ Segment *pDel = 0; /* Delete separators from this segment */ int nWrite = 0; /* Number of database pages written */ assert( pnOvfl ); /* Allocate the new level structure to write to. */ pNext = lsmDbSnapshotLevel(pDb->pWorker); pNew = (Level *)lsmMallocZeroRc(pDb->pEnv, sizeof(Level), &rc); |
︙ | ︙ | |||
3700 3701 3702 3703 3704 3705 3706 | if( pCsr ){ pCsr->pDb = pDb; multiCursorVisitFreelist(pCsr, pnOvfl); rc = multiCursorAddTree(pCsr, pDb->pWorker, eTree); if( rc==LSM_OK && pNext && pNext->pMerge==0 && pNext->lhs.iRoot ){ pDel = &pNext->lhs; rc = btreeCursorNew(pDb, pDel, &pCsr->pBtCsr); | > | > > > > < < < > < < < < > > > > > | 3825 3826 3827 3828 3829 3830 3831 3832 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 3847 3848 3849 3850 3851 3852 3853 3854 3855 3856 3857 3858 3859 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 3888 3889 3890 3891 3892 3893 | if( pCsr ){ pCsr->pDb = pDb; multiCursorVisitFreelist(pCsr, pnOvfl); rc = multiCursorAddTree(pCsr, pDb->pWorker, eTree); if( rc==LSM_OK && pNext && pNext->pMerge==0 && pNext->lhs.iRoot ){ pDel = &pNext->lhs; rc = btreeCursorNew(pDb, pDel, &pCsr->pBtCsr); } if( pNext==0 ){ multiCursorIgnoreDelete(pCsr); } } if( rc!=LSM_OK ){ lsmMCursorClose(pCsr); }else{ Pgno iLeftPtr = 0; Merge merge; /* Merge object used to create new level */ MergeWorker mergeworker; /* MergeWorker object for the same purpose */ memset(&merge, 0, sizeof(Merge)); memset(&mergeworker, 0, sizeof(MergeWorker)); pNew->pMerge = &merge; pNew->iAge = -1; mergeworker.pDb = pDb; mergeworker.pLevel = pNew; mergeworker.pCsr = pCsr; pCsr->pPrevMergePtr = &iLeftPtr; /* Mark the separators array for the new level as a "phantom". */ mergeworker.bFlush = 1; /* Do the work to create the new merged segment on disk */ if( rc==LSM_OK ) rc = lsmMCursorFirst(pCsr); while( rc==LSM_OK && mergeWorkerDone(&mergeworker)==0 ){ rc = mergeWorkerStep(&mergeworker); } nWrite = mergeworker.nWork; mergeWorkerShutdown(&mergeworker, &rc); pNew->pMerge = 0; pNew->iAge = 0; } /* Link the new level into the top of the tree. */ if( rc==LSM_OK ){ if( pDel ) pDel->iRoot = 0; }else{ lsmDbSnapshotSetLevel(pDb->pWorker, pNext); sortedFreeLevel(pDb->pEnv, pNew); } #if 0 lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "new-toplevel"); #endif if( rc==LSM_OK ){ assertBtreeOk(pDb, &pNew->lhs); sortedInvokeWorkHook(pDb); } if( pnWrite ) *pnWrite = nWrite; pDb->pWorker->nWrite += nWrite; return rc; } /* |
︙ | ︙ | |||
3838 3839 3840 3841 3842 3843 3844 | pNew->pMerge = pMerge; } *ppNew = pNew; return rc; } | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 | pNew->pMerge = pMerge; } *ppNew = pNew; return rc; } static int mergeWorkerInit( lsm_db *pDb, /* Db connection to do merge work */ Level *pLevel, /* Level to work on merging */ MergeWorker *pMW /* Object to initialize */ ){ int rc = LSM_OK; /* Return code */ Merge *pMerge = pLevel->pMerge; /* Persistent part of merge state */ |
︙ | ︙ | |||
3912 3913 3914 3915 3916 3917 3918 | }else{ multiCursorIgnoreDelete(pCsr); } assert( rc!=LSM_OK || pMerge->nInput==(pCsr->nPtr+(pCsr->pBtCsr!=0)) ); pMW->pCsr = pCsr; | | | > > > | | 4011 4012 4013 4014 4015 4016 4017 4018 4019 4020 4021 4022 4023 4024 4025 4026 4027 4028 4029 4030 4031 4032 4033 4034 | }else{ multiCursorIgnoreDelete(pCsr); } assert( rc!=LSM_OK || pMerge->nInput==(pCsr->nPtr+(pCsr->pBtCsr!=0)) ); pMW->pCsr = pCsr; /* Load the b-tree hierarchy into memory. */ if( rc==LSM_OK ) rc = mergeWorkerLoadHierarchy(pMW); if( rc==LSM_OK && pMW->hier.nHier==0 ){ pMW->aSave[0].iPgno = pLevel->lhs.iFirst; } /* Position the cursor. */ if( rc==LSM_OK ){ pCsr->pPrevMergePtr = &pMerge->iCurrentPtr; if( pLevel->lhs.iFirst==0 ){ /* The output array is still empty. So position the cursor at the very ** start of the input. */ rc = multiCursorEnd(pCsr, 0); }else{ /* The output array is non-empty. Position the cursor based on the ** page/cell data saved in the Merge.aInput[] array. */ int i; |
︙ | ︙ | |||
3955 3956 3957 3958 3959 3960 3961 3962 3963 3964 3965 3966 3967 3968 3969 3970 3971 3972 3973 3974 3975 3976 3977 3978 3979 3980 | } pCsr->flags |= CURSOR_NEXT_OK; } return rc; } static int sortedBtreeGobble( lsm_db *pDb, MultiCursor *pCsr, int iGobble ){ int rc = LSM_OK; if( rtTopic(pCsr->eType)==0 ){ Segment *pSeg = pCsr->aPtr[iGobble].pSeg; Blob *p = &pCsr->key; Pgno *aPg; int nPg; assert( pSeg->iRoot>0 ); aPg = lsmMallocZeroRc(pDb->pEnv, sizeof(Pgno)*32, &rc); if( rc==LSM_OK ){ rc = seekInBtree(pCsr, pSeg, p->pData, p->nData, aPg, 0); } for(nPg=0; aPg[nPg]; nPg++); | > > | | 4057 4058 4059 4060 4061 4062 4063 4064 4065 4066 4067 4068 4069 4070 4071 4072 4073 4074 4075 4076 4077 4078 4079 4080 4081 4082 4083 4084 4085 4086 4087 4088 4089 4090 4091 4092 | } pCsr->flags |= CURSOR_NEXT_OK; } return rc; } /* TODO: Re-enable this!!! */ static int sortedBtreeGobble( lsm_db *pDb, MultiCursor *pCsr, int iGobble ){ int rc = LSM_OK; if( rtTopic(pCsr->eType)==0 ){ Segment *pSeg = pCsr->aPtr[iGobble].pSeg; Blob *p = &pCsr->key; Pgno *aPg; int nPg; assert( pSeg->iRoot>0 ); aPg = lsmMallocZeroRc(pDb->pEnv, sizeof(Pgno)*32, &rc); if( rc==LSM_OK ){ rc = seekInBtree(pCsr, pSeg, p->pData, p->nData, aPg, 0); } for(nPg=0; aPg[nPg]; nPg++); #if 0 lsmFsGobble(pDb, pSeg, aPg, nPg); #endif lsmFree(pDb->pEnv, aPg); } return rc; } |
︙ | ︙ | |||
4175 4176 4177 4178 4179 4180 4181 4182 4183 4184 4185 4186 4187 4188 | ** the database structure has changed. */ mergeWorkerShutdown(&mergeworker, &rc); if( rc==LSM_OK ) sortedInvokeWorkHook(pDb); #if 0 lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "work"); #endif assertRunInOrder(pDb, &pLevel->lhs); /* If bFlush is true and the database is no longer considered "full", ** break out of the loop even if nRemaining is still greater than ** zero. The caller has an in-memory tree to flush to disk. */ if( bFlush && sortedDbIsFull(pDb)==0 ) break; } | > | 4279 4280 4281 4282 4283 4284 4285 4286 4287 4288 4289 4290 4291 4292 4293 | ** the database structure has changed. */ mergeWorkerShutdown(&mergeworker, &rc); if( rc==LSM_OK ) sortedInvokeWorkHook(pDb); #if 0 lsmSortedDumpStructure(pDb, pDb->pWorker, 1, 0, "work"); #endif assertBtreeOk(pDb, &pLevel->lhs); assertRunInOrder(pDb, &pLevel->lhs); /* If bFlush is true and the database is no longer considered "full", ** break out of the loop even if nRemaining is still greater than ** zero. The caller has an in-memory tree to flush to disk. */ if( bFlush && sortedDbIsFull(pDb)==0 ) break; } |
︙ | ︙ | |||
4196 4197 4198 4199 4200 4201 4202 | #endif return rc; } /* ** The database connection passed as the first argument must be a worker ** connection. This function checks if there exists an "old" in-memory tree | | < | > | > > | | | < | | | | | | | | > > > | | 4301 4302 4303 4304 4305 4306 4307 4308 4309 4310 4311 4312 4313 4314 4315 4316 4317 4318 4319 4320 4321 4322 4323 4324 4325 4326 4327 4328 4329 4330 4331 4332 4333 4334 4335 4336 4337 4338 4339 4340 | #endif return rc; } /* ** The database connection passed as the first argument must be a worker ** connection. This function checks if there exists an "old" in-memory tree ** ready to be flushed to disk. If so, true is returned. Otherwise false. ** ** If an error occurs, *pRc is set to an LSM error code before returning. ** It is assumed that *pRc is set to LSM_OK when this function is called. */ static int sortedTreeHasOld(lsm_db *pDb, int *pRc){ int rc = LSM_OK; int bRet = 0; assert( pDb->pWorker ); if( *pRc==LSM_OK ){ if( pDb->nTransOpen==0 ){ rc = lsmTreeLoadHeader(pDb, 0); } if( rc==LSM_OK && pDb->treehdr.iOldShmid && pDb->treehdr.iOldLog!=pDb->pWorker->iLogOff ){ bRet = 1; }else{ bRet = 0; } *pRc = rc; } assert( *pRc==LSM_OK || bRet==0 ); return bRet; } static int doLsmSingleWork( lsm_db *pDb, int bShutdown, int flags, int nPage, /* Number of pages to write to disk */ |
︙ | ︙ | |||
4240 4241 4242 4243 4244 4245 4246 4247 4248 4249 4250 4251 4252 4253 | int bCkpt = 0; int bToplevel = 0; /* Open the worker 'transaction'. It will be closed before this function ** returns. */ assert( pDb->pWorker==0 ); rc = lsmBeginWork(pDb); if( rc!=LSM_OK ) return rc; /* If this connection is doing auto-checkpoints, set nMax (and nRem) so ** that this call stops writing when the auto-checkpoint is due. */ if( bShutdown==0 && pDb->nAutockpt ){ u32 nSync; u32 nUnsync; | > | 4349 4350 4351 4352 4353 4354 4355 4356 4357 4358 4359 4360 4361 4362 4363 | int bCkpt = 0; int bToplevel = 0; /* Open the worker 'transaction'. It will be closed before this function ** returns. */ assert( pDb->pWorker==0 ); rc = lsmBeginWork(pDb); assert( rc!=8 ); if( rc!=LSM_OK ) return rc; /* If this connection is doing auto-checkpoints, set nMax (and nRem) so ** that this call stops writing when the auto-checkpoint is due. */ if( bShutdown==0 && pDb->nAutockpt ){ u32 nSync; u32 nUnsync; |
︙ | ︙ | |||
4261 4262 4263 4264 4265 4266 4267 | nMax = (pDb->nAutockpt/nPgsz) - (nUnsync-nSync); if( nMax<nRem ){ bCkpt = 1; nRem = LSM_MAX(nMax, 0); } } | | < | < < | < | | | | | | | < | | | | | | | | | < | 4371 4372 4373 4374 4375 4376 4377 4378 4379 4380 4381 4382 4383 4384 4385 4386 4387 4388 4389 4390 4391 4392 4393 4394 4395 4396 4397 4398 4399 4400 4401 4402 4403 | nMax = (pDb->nAutockpt/nPgsz) - (nUnsync-nSync); if( nMax<nRem ){ bCkpt = 1; nRem = LSM_MAX(nMax, 0); } } /* If there exists in-memory data ready to be flushed to disk, attempt ** to flush it now. */ if( sortedTreeHasOld(pDb, &rc) ){ if( sortedDbIsFull(pDb) ){ int nPg = 0; rc = sortedWork(pDb, nRem, 0, 1, &nPg); nRem -= nPg; assert( rc!=LSM_OK || nRem<=0 || !sortedDbIsFull(pDb) ); bToplevel = 1; } if( rc==LSM_OK && nRem>0 ){ int nPg = 0; rc = sortedNewToplevel(pDb, TREE_OLD, &nOvfl, &nPg); nRem -= nPg; if( rc==LSM_OK && pDb->nTransOpen>0 ){ lsmTreeDiscardOld(pDb); } bFlush = 1; bToplevel = 0; } } /* If nPage is still greater than zero, do some merging. */ if( rc==LSM_OK && nRem>0 && bShutdown==0 ){ int nPg = 0; int bOptimize = ((flags & LSM_WORK_OPTIMIZE) ? 1 : 0); |
︙ | ︙ | |||
4312 4313 4314 4315 4316 4317 4318 | } if( rc==LSM_OK && lsmCheckpointOverflowRequired(pDb) ){ rc = sortedNewToplevel(pDb, TREE_NONE, &nOvfl, 0); } } if( rc==LSM_OK && (nRem!=nMax) ){ | < | 4416 4417 4418 4419 4420 4421 4422 4423 4424 4425 4426 4427 4428 4429 | } if( rc==LSM_OK && lsmCheckpointOverflowRequired(pDb) ){ rc = sortedNewToplevel(pDb, TREE_NONE, &nOvfl, 0); } } if( rc==LSM_OK && (nRem!=nMax) ){ lsmFinishWork(pDb, bFlush, nOvfl, &rc); }else{ int rcdummy = LSM_BUSY; assert( rc!=LSM_OK || bFlush==0 ); lsmFinishWork(pDb, 0, 0, &rcdummy); } assert( pDb->pWorker==0 ); |
︙ | ︙ | |||
4457 4458 4459 4460 4461 4462 4463 | ** Space for the returned string is allocated using lsmMalloc(), and should ** be freed by the caller using lsmFree(). */ static char *segToString(lsm_env *pEnv, Segment *pSeg, int nMin){ int nSize = pSeg->nSize; Pgno iRoot = pSeg->iRoot; Pgno iFirst = pSeg->iFirst; | | | 4560 4561 4562 4563 4564 4565 4566 4567 4568 4569 4570 4571 4572 4573 4574 | ** Space for the returned string is allocated using lsmMalloc(), and should ** be freed by the caller using lsmFree(). */ static char *segToString(lsm_env *pEnv, Segment *pSeg, int nMin){ int nSize = pSeg->nSize; Pgno iRoot = pSeg->iRoot; Pgno iFirst = pSeg->iFirst; Pgno iLast = pSeg->iLastPg; char *z; char *z1; char *z2; int nPad; z1 = lsmMallocPrintf(pEnv, "%d.%d", iFirst, iLast); |
︙ | ︙ | |||
4540 4541 4542 4543 4544 4545 4546 | aCell = pageGetCell(aData, nData, i); eType = *aCell++; assert( (flags & SEGMENT_BTREE_FLAG) || eType!=0 ); aCell += lsmVarintGet32(aCell, &iPgPtr); if( eType==0 ){ Pgno iRef; /* Page number of referenced page */ | | | 4643 4644 4645 4646 4647 4648 4649 4650 4651 4652 4653 4654 4655 4656 4657 | aCell = pageGetCell(aData, nData, i); eType = *aCell++; assert( (flags & SEGMENT_BTREE_FLAG) || eType!=0 ); aCell += lsmVarintGet32(aCell, &iPgPtr); if( eType==0 ){ Pgno iRef; /* Page number of referenced page */ aCell += lsmVarintGet64(aCell, &iRef); lsmFsDbPageGet(pDb->pFS, iRef, &pRef); aKey = pageGetKey(pRef, 0, &iTopic, &nKey, &blob); }else{ aCell += lsmVarintGet32(aCell, &nKey); if( rtIsWrite(eType) ) aCell += lsmVarintGet32(aCell, &nVal); sortedReadData(pPg, (aCell-aData), nKey+nVal, (void **)&aKey, &blob); aVal = &aKey[nKey]; |
︙ | ︙ | |||
4575 4576 4577 4578 4579 4580 4581 4582 4583 4584 4585 4586 4587 4588 | lsmStringClear(&s); sortedBlobFree(&blob); } static void infoCellDump( lsm_db *pDb, Page *pPg, int iCell, int *peType, int *piPgPtr, u8 **paKey, int *pnKey, u8 **paVal, int *pnVal, Blob *pBlob | > | 4678 4679 4680 4681 4682 4683 4684 4685 4686 4687 4688 4689 4690 4691 4692 | lsmStringClear(&s); sortedBlobFree(&blob); } static void infoCellDump( lsm_db *pDb, int bIndirect, /* True to follow indirect refs */ Page *pPg, int iCell, int *peType, int *piPgPtr, u8 **paKey, int *pnKey, u8 **paVal, int *pnVal, Blob *pBlob |
︙ | ︙ | |||
4600 4601 4602 4603 4604 4605 4606 | aCell = pageGetCell(aData, nData, iCell); eType = *aCell++; aCell += lsmVarintGet32(aCell, &iPgPtr); if( eType==0 ){ int dummy; Pgno iRef; /* Page number of referenced page */ | | > | | | | | > > > > | 4704 4705 4706 4707 4708 4709 4710 4711 4712 4713 4714 4715 4716 4717 4718 4719 4720 4721 4722 4723 4724 4725 4726 4727 4728 | aCell = pageGetCell(aData, nData, iCell); eType = *aCell++; aCell += lsmVarintGet32(aCell, &iPgPtr); if( eType==0 ){ int dummy; Pgno iRef; /* Page number of referenced page */ aCell += lsmVarintGet64(aCell, &iRef); if( bIndirect ){ lsmFsDbPageGet(pDb->pFS, iRef, &pRef); pageGetKeyCopy(pDb->pEnv, pRef, 0, &dummy, pBlob); aKey = (u8 *)pBlob->pData; nKey = pBlob->nData; lsmFsPageRelease(pRef); }else{ aKey = (u8 *)"<indirect>"; nKey = 11; } }else{ aCell += lsmVarintGet32(aCell, &nKey); if( rtIsWrite(eType) ) aCell += lsmVarintGet32(aCell, &nVal); sortedReadData(pPg, (aCell-aData), nKey+nVal, (void **)&aKey, pBlob); aVal = &aKey[nKey]; } |
︙ | ︙ | |||
4633 4634 4635 4636 4637 4638 4639 | }else{ lsmStringAppendf(pStr, "%c", isalnum(z[iChar]) ?z[iChar] : '.'); } } return LSM_OK; } | | | | > > | 4742 4743 4744 4745 4746 4747 4748 4749 4750 4751 4752 4753 4754 4755 4756 4757 4758 4759 4760 4761 4762 4763 4764 4765 4766 4767 4768 4769 4770 4771 4772 4773 4774 4775 | }else{ lsmStringAppendf(pStr, "%c", isalnum(z[iChar]) ?z[iChar] : '.'); } } return LSM_OK; } #define INFO_PAGE_DUMP_DATA 0x01 #define INFO_PAGE_DUMP_VALUES 0x02 #define INFO_PAGE_DUMP_HEX 0x04 #define INFO_PAGE_DUMP_INDIRECT 0x08 static int infoPageDump( lsm_db *pDb, /* Database handle */ Pgno iPg, /* Page number of page to dump */ int flags, char **pzOut /* OUT: lsmMalloc'd string */ ){ int rc = LSM_OK; /* Return code */ Page *pPg = 0; /* Handle for page iPg */ int i, j; /* Loop counters */ const int perLine = 16; /* Bytes per line in the raw hex dump */ int bValues = (flags & INFO_PAGE_DUMP_VALUES); int bHex = (flags & INFO_PAGE_DUMP_HEX); int bData = (flags & INFO_PAGE_DUMP_DATA); int bIndirect = (flags & INFO_PAGE_DUMP_INDIRECT); *pzOut = 0; if( iPg==0 ) return LSM_ERROR; rc = lsmFsDbPageGet(pDb->pFS, iPg, &pPg); if( rc==LSM_OK ){ Blob blob = {0, 0, 0, 0}; |
︙ | ︙ | |||
4680 4681 4682 4683 4684 4685 4686 | lsmStringAppendf(&str, "nRec : %d\n", nRec); lsmStringAppendf(&str, "iPtr : %d\n", iPtr); lsmStringAppendf(&str, "flags: %04x\n", flags); lsmStringAppendf(&str, "\n"); for(iCell=0; iCell<nRec; iCell++){ int nKey; | | | | 4791 4792 4793 4794 4795 4796 4797 4798 4799 4800 4801 4802 4803 4804 4805 4806 4807 4808 4809 4810 4811 4812 4813 4814 4815 4816 4817 4818 4819 | lsmStringAppendf(&str, "nRec : %d\n", nRec); lsmStringAppendf(&str, "iPtr : %d\n", iPtr); lsmStringAppendf(&str, "flags: %04x\n", flags); lsmStringAppendf(&str, "\n"); for(iCell=0; iCell<nRec; iCell++){ int nKey; infoCellDump(pDb, bIndirect, pPg, iCell, 0, 0, 0, &nKey, 0, 0, &blob); if( nKey>nKeyWidth ) nKeyWidth = nKey; } if( bHex ) nKeyWidth = nKeyWidth * 2; for(iCell=0; iCell<nRec; iCell++){ u8 *aKey; int nKey = 0; /* Key */ u8 *aVal; int nVal = 0; /* Value */ int iPgPtr; int eType; char cType = '?'; Pgno iAbsPtr; char zFlags[8]; infoCellDump(pDb, bIndirect, pPg, iCell, &eType, &iPgPtr, &aKey, &nKey, &aVal, &nVal, &blob ); iAbsPtr = iPgPtr + ((flags & SEGMENT_BTREE_FLAG) ? 0 : iPtr); lsmFlagsToString(eType, zFlags); lsmStringAppendf(&str, "%s %d (%s) ", zFlags, iAbsPtr, (rtTopic(eType) ? "sys" : "usr") |
︙ | ︙ | |||
4880 4881 4882 4883 4884 4885 4886 | for(p=pLevel; p; p=pNext){ pNext = p->pNext; sortedFreeLevel(pEnv, p); } } | < < < < < < < < < < < < < < < < | 4991 4992 4993 4994 4995 4996 4997 4998 4999 5000 5001 5002 5003 5004 | for(p=pLevel; p; p=pNext){ pNext = p->pNext; sortedFreeLevel(pEnv, p); } } void lsmSortedSaveTreeCursors(lsm_db *pDb){ MultiCursor *pCsr; for(pCsr=pDb->pCsr; pCsr; pCsr=pCsr->pNext){ lsmTreeCursorSave(pCsr->apTreeCsr[0]); lsmTreeCursorSave(pCsr->apTreeCsr[1]); } } |
︙ | ︙ |
Changes to src/lsm_tree.c.
︙ | ︙ | |||
2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 | TreeKey *pKey = csrGetKey(&csr, &blob, &rc); if( rc!=LSM_OK ) break; assert( ((prev&LSM_START_DELETE)==0)==((pKey->flags&LSM_END_DELETE)==0) ); prev = pKey->flags; } tblobFree(csr.pDb, &csr.blob); return 1; } static int treeCountEntries(lsm_db *db){ TreeCursor csr; /* Cursor used to iterate through tree */ int rc; | > | 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 | TreeKey *pKey = csrGetKey(&csr, &blob, &rc); if( rc!=LSM_OK ) break; assert( ((prev&LSM_START_DELETE)==0)==((pKey->flags&LSM_END_DELETE)==0) ); prev = pKey->flags; } tblobFree(csr.pDb, &csr.blob); tblobFree(csr.pDb, &blob); return 1; } static int treeCountEntries(lsm_db *db){ TreeCursor csr; /* Cursor used to iterate through tree */ int rc; |
︙ | ︙ |
Changes to test/csr1.test.
︙ | ︙ | |||
56 57 58 59 60 61 62 | populate_db do_execsql_test 2.1 { BEGIN; INSERT INTO t1 VALUES(10, 100); } do_test 2.2 { sqlite4 db2 ./test.db | | | | 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 | populate_db do_execsql_test 2.1 { BEGIN; INSERT INTO t1 VALUES(10, 100); } do_test 2.2 { sqlite4 db2 ./test.db list [catch { db2 eval { BEGIN ; INSERT INTO t1 VALUES(1, 2) } } msg] $msg } {1 {database is locked}} do_execsql_test 2.3 { COMMIT } do_test 2.4 { sqlite4_lsm_work db2 main -flush 0 } {0} db2 close #------------------------------------------------------------------------- |
︙ | ︙ |
Changes to test/log1.test.
︙ | ︙ | |||
130 131 132 133 134 135 136 | execsql { SELECT count(*) FROM t1 } db2 } {256} db2 close reset_db do_execsql_test 3.5 { CREATE TABLE t1(a, b) } do_test 3.6 { | | | 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 | execsql { SELECT count(*) FROM t1 } db2 } {256} db2 close reset_db do_execsql_test 3.5 { CREATE TABLE t1(a, b) } do_test 3.6 { sqlite4_lsm_checkpoint db main for {set i 0} {$i < 203} {incr i} { execsql { INSERT INTO t1 VALUES(randstr(100,100), randstr(100,100)) } } execsql { SELECT count(*) FROM t1 } } {203} do_test 3.7 { |
︙ | ︙ | |||
272 273 274 275 276 277 278 | INSERT INTO x VALUES(randstr(10,10), randstr(100,100)); INSERT INTO x VALUES(randstr(10,10), randstr(100,100)); INSERT INTO x VALUES(randstr(10,10), randstr(100,100)); INSERT INTO x VALUES(randstr(10,10), randstr(100,100)); INSERT INTO x VALUES(randstr(10,10), randstr(100,100)); } do_filesize_test 8.2 0 776 | > | | | 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 | INSERT INTO x VALUES(randstr(10,10), randstr(100,100)); INSERT INTO x VALUES(randstr(10,10), randstr(100,100)); INSERT INTO x VALUES(randstr(10,10), randstr(100,100)); INSERT INTO x VALUES(randstr(10,10), randstr(100,100)); INSERT INTO x VALUES(randstr(10,10), randstr(100,100)); } do_filesize_test 8.2 0 776 do_test 8.3.1 { sqlite4_lsm_flush db main } {} do_test 8.3.2 { sqlite4_lsm_work db main } 0 do_execsql_test 8.4 { INSERT INTO x VALUES(randstr(10,10), randstr(100,100)) } do_filesize_test 8.5 12288 915 do_test 8.6 { sqlite4_lsm_checkpoint db main } {} do_test 8.7 { copy_db_files test.db test.db2 sqlite4 db2 test.db2 execsql { SELECT count(*) FROM x ; PRAGMA integrity_check } db2 } {6 ok} |
︙ | ︙ | |||
354 355 356 357 358 359 360 | } do_test 10.7 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 556} do_test 10.8 { sqlite4_lsm_work db main -flush } 0 do_execsql_test 10.9 { INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100)); } do_test 10.9 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 695} | | | 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 | } do_test 10.7 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 556} do_test 10.8 { sqlite4_lsm_work db main -flush } 0 do_execsql_test 10.9 { INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100)); } do_test 10.9 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 695} do_test 10.10 { sqlite4_lsm_checkpoint db main } {} do_test 10.11 { sqlite4_lsm_info db main log-structure } {0 0 0 0 556 695} #------------------------------------------------------------------------- # reset_db do_test 11.1 { sqlite4_lsm_config db main log-size 800 } 800 do_test 11.2 { sqlite4_lsm_config db main log-size } 800 |
︙ | ︙ | |||
381 382 383 384 385 386 387 | do_test 11.4 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 1335} do_test 11.5 { sqlite4_lsm_work db main -flush } 0 do_execsql_test 11.6 { INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100)); } do_test 11.7 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 1474} | | | 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 | do_test 11.4 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 1335} do_test 11.5 { sqlite4_lsm_work db main -flush } 0 do_execsql_test 11.6 { INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100)); } do_test 11.7 { sqlite4_lsm_info db main log-structure } {0 0 0 0 0 1474} do_test 11.8 { sqlite4_lsm_checkpoint db main } {} do_test 11.9 { sqlite4_lsm_info db main log-structure } {0 0 0 0 1335 1474} do_execsql_test 11.10 { INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100)); } do_test 11.11 { sqlite4_lsm_info db main log-structure } {1335 1482 0 0 0 139} do_test 11.12 { execsql { SELECT count(*) FROM t1 ; PRAGMA integrity_check } |
︙ | ︙ | |||
441 442 443 444 445 446 447 | db eval {SELECT randstr(5,5)} do_execsql_test 11.22 { INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100)); } do_test 11.23 { sqlite4_lsm_info db main log-structure } {1335 1482 0 1259 1483 1908} | | | | 442 443 444 445 446 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 466 467 468 469 470 471 472 473 474 475 | db eval {SELECT randstr(5,5)} do_execsql_test 11.22 { INSERT INTO t1 VALUES(randstr(10,10), randstr(100,100)); } do_test 11.23 { sqlite4_lsm_info db main log-structure } {1335 1482 0 1259 1483 1908} do_test 11.24 { sqlite4_lsm_checkpoint db main } {} do_test 11.25 { sqlite4_lsm_info db main log-structure } {0 0 0 0 1769 1908} #------------------------------------------------------------------------- # reset_db do_test 12.1 { sqlite4_lsm_config db main log-size 800 } 800 do_execsql_test 12.2 { CREATE TABLE t1(a PRIMARY KEY, b); CREATE INDEX i1 ON t1(b); } for {set iTest 1} {$iTest<=150} {incr iTest} { expr srand(0) do_test 12.3.$iTest { for {set i 0} {$i < 10} {incr i} { execsql { INSERT INTO t1 VALUES(randstr(20,20), randstr(100,100)) } if { int(rand()*10.0)==0 } { sqlite4_lsm_work db main -flush } if { int(rand()*10.0)==0 } { sqlite4_lsm_checkpoint db main } } copy_db_files test.db test.db2 sqlite4 db2 test.db2 set sql "SELECT count(*) FROM t1 ; " if {0==($iTest % 25)} { append sql "PRAGMA integrity_check" } else { |
︙ | ︙ |
Changes to test/log3.test.
︙ | ︙ | |||
52 53 54 55 56 57 58 | INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)); INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)); INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)); COMMIT; } {} do_filesize_test 2.5 0 2048 | | | | | | 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 | INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)); INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)); INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)); COMMIT; } {} do_filesize_test 2.5 0 2048 do_test 2.6 { sqlite4_lsm_flush db main } {} do_execsql_test 2.7 { INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)) } do_test 2.8 { sqlite4_lsm_checkpoint db main } {} do_test 2.9 { sqlite4_lsm_info db main log-structure } {0 0 0 0 2560 3072} for {set i 1} {$i <= 6} {incr i} { do_execsql_test 2.10.$i.1 { INSERT INTO t1 VALUES(randstr(50,50), randstr(50,50)); } do_execsql_test 2.10.$i.2 { SELECT count(*) FROM t1 } [expr 8 + $i] do_recover_test 2.10.$i.3 { SELECT count(*) FROM t1 } [expr 8 + $i] } do_test 2.11 { sqlite4_lsm_info db main log-structure } {2560 3080 0 2216 3584 4608} finish_test |
Changes to test/permutations.test.
︙ | ︙ | |||
130 131 132 133 134 135 136 | # full # lappend ::testsuitelist xxx test_suite "src4" -prefix "" -description { } -files { simple.test simple2.test | | | 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 | # full # lappend ::testsuitelist xxx test_suite "src4" -prefix "" -description { } -files { simple.test simple2.test log3.test csr1.test ckpt1.test mc1.test aggerror.test attach.test autoindex1.test |
︙ | ︙ |
Changes to test/simple.test.
︙ | ︙ | |||
1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 | do_catchsql_test 70.3 { select * from maintable, joinme INDEXED by joinme_id_text_idx } {1 {cannot use index: joinme_id_text_idx}} #------------------------------------------------------------------------- # This is testing that the "phantom" runs feature works. reset_db do_execsql_test 71.1 { CREATE TABLE t1(x); INSERT INTO t1 VALUES(randomblob(1024)); -- 1 INSERT INTO t1 SELECT randomblob(1024) FROM t1; -- 2 INSERT INTO t1 SELECT randomblob(1024) FROM t1; -- 4 INSERT INTO t1 SELECT randomblob(1024) FROM t1; -- 8 | > > > | 1365 1366 1367 1368 1369 1370 1371 1372 1373 1374 1375 1376 1377 1378 1379 1380 1381 | do_catchsql_test 70.3 { select * from maintable, joinme INDEXED by joinme_id_text_idx } {1 {cannot use index: joinme_id_text_idx}} #------------------------------------------------------------------------- # This is testing that the "phantom" runs feature works. # # UPDATE: Said feature was dropped early in development. But the test # remains valid. reset_db do_execsql_test 71.1 { CREATE TABLE t1(x); INSERT INTO t1 VALUES(randomblob(1024)); -- 1 INSERT INTO t1 SELECT randomblob(1024) FROM t1; -- 2 INSERT INTO t1 SELECT randomblob(1024) FROM t1; -- 4 INSERT INTO t1 SELECT randomblob(1024) FROM t1; -- 8 |
︙ | ︙ | |||
1386 1387 1388 1389 1390 1391 1392 1393 1394 | do_execsql_test 71.3 { SELECT count(*) FROM t1 } 64 do_test 71.4 { expr {[file size test.db] < 256*1024} } {1} #------------------------------------------------------------------------- # This is testing that the "phantom" runs feature works with mmap. reset_db | > > > | | | | 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 | do_execsql_test 71.3 { SELECT count(*) FROM t1 } 64 do_test 71.4 { expr {[file size test.db] < 256*1024} } {1} #------------------------------------------------------------------------- # This is testing that the "phantom" runs feature works with mmap. # # UPDATE: Said feature was dropped early in development. But the test # remains valid. reset_db #do_test 72.0.1 { sqlite4_lsm_config db main mmap } 0 #do_test 72.0.2 { sqlite4_lsm_config db main mmap 1 } 1 #do_test 72.0.3 { sqlite4_lsm_config db main mmap } 1 do_execsql_test 72.1 { CREATE TABLE t1(x); INSERT INTO t1 VALUES(randomblob(1024)); -- 1 INSERT INTO t1 SELECT randomblob(1024) FROM t1; -- 2 INSERT INTO t1 SELECT randomblob(1024) FROM t1; -- 4 INSERT INTO t1 SELECT randomblob(1024) FROM t1; -- 8 |
︙ | ︙ |
Changes to test/test_lsm.c.
︙ | ︙ | |||
150 151 152 153 154 155 156 | Tcl_Obj *CONST objv[] ){ struct Switch { const char *zSwitch; int flags; } aSwitch[] = { { "-flush", LSM_WORK_FLUSH }, | < | 150 151 152 153 154 155 156 157 158 159 160 161 162 163 | Tcl_Obj *CONST objv[] ){ struct Switch { const char *zSwitch; int flags; } aSwitch[] = { { "-flush", LSM_WORK_FLUSH }, { "-optimize", LSM_WORK_OPTIMIZE }, { 0, 0 } }; int flags = 0; int nPage = 0; const char *zDb; |
︙ | ︙ | |||
203 204 205 206 207 208 209 210 211 212 213 214 215 | Tcl_SetResult(interp, (char *)sqlite4TestErrorName(rc), TCL_STATIC); return TCL_ERROR; } Tcl_SetObjResult(interp, Tcl_NewIntObj(nWork)); return TCL_OK; } int SqlitetestLsm_Init(Tcl_Interp *interp){ struct SyscallCmd { const char *zName; Tcl_ObjCmdProc *xCmd; } aCmd[] = { | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | > > | | | 202 203 204 205 206 207 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 227 228 229 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 | Tcl_SetResult(interp, (char *)sqlite4TestErrorName(rc), TCL_STATIC); return TCL_ERROR; } Tcl_SetObjResult(interp, Tcl_NewIntObj(nWork)); return TCL_OK; } /* ** TCLCMD: sqlite4_lsm_checkpoint DB DBNAME */ static int test_lsm_checkpoint( void * clientData, Tcl_Interp *interp, int objc, Tcl_Obj *CONST objv[] ){ const char *zDb; const char *zName; int rc; sqlite4 *db; lsm_db *pLsm; if( objc!=3 ){ Tcl_WrongNumArgs(interp, 1, objv, "DB DBNAME"); return TCL_ERROR; } zDb = Tcl_GetString(objv[1]); zName = Tcl_GetString(objv[2]); rc = getDbPointer(interp, zDb, &db); if( rc!=TCL_OK ) return rc; rc = sqlite4_kvstore_control(db, zName, SQLITE4_KVCTRL_LSM_HANDLE, &pLsm); if( rc==SQLITE4_OK ){ rc = lsm_checkpoint(pLsm, 0); } if( rc!=SQLITE4_OK ){ Tcl_SetResult(interp, (char *)sqlite4TestErrorName(rc), TCL_STATIC); return TCL_ERROR; } Tcl_ResetResult(interp); return TCL_OK; } /* ** TCLCMD: sqlite4_lsm_flush DB DBNAME */ static int test_lsm_flush( void * clientData, Tcl_Interp *interp, int objc, Tcl_Obj *CONST objv[] ){ const char *zDb; const char *zName; int rc; sqlite4 *db; lsm_db *pLsm; if( objc!=3 ){ Tcl_WrongNumArgs(interp, 1, objv, "DB DBNAME"); return TCL_ERROR; } zDb = Tcl_GetString(objv[1]); zName = Tcl_GetString(objv[2]); rc = getDbPointer(interp, zDb, &db); if( rc!=TCL_OK ) return rc; rc = sqlite4_kvstore_control(db, zName, SQLITE4_KVCTRL_LSM_HANDLE, &pLsm); if( rc==SQLITE4_OK ){ int nZero = 0; int nOrig = -1; lsm_config(pLsm, LSM_CONFIG_WRITE_BUFFER, &nOrig); lsm_config(pLsm, LSM_CONFIG_WRITE_BUFFER, &nZero); rc = lsm_begin(pLsm, 1); if( rc==LSM_OK ) rc = lsm_commit(pLsm, 0); lsm_config(pLsm, LSM_CONFIG_WRITE_BUFFER, &nOrig); } if( rc!=SQLITE4_OK ){ Tcl_SetResult(interp, (char *)sqlite4TestErrorName(rc), TCL_STATIC); return TCL_ERROR; } Tcl_ResetResult(interp); return TCL_OK; } int SqlitetestLsm_Init(Tcl_Interp *interp){ struct SyscallCmd { const char *zName; Tcl_ObjCmdProc *xCmd; } aCmd[] = { { "sqlite4_lsm_work", test_lsm_work }, { "sqlite4_lsm_checkpoint", test_lsm_checkpoint }, { "sqlite4_lsm_flush", test_lsm_flush }, { "sqlite4_lsm_info", test_lsm_info }, { "sqlite4_lsm_config", test_lsm_config }, }; int i; for(i=0; i<sizeof(aCmd)/sizeof(aCmd[0]); i++){ Tcl_CreateObjCommand(interp, aCmd[i].zName, aCmd[i].xCmd, 0, 0); } return TCL_OK; } |
Changes to test/tester.tcl.
︙ | ︙ | |||
1516 1517 1518 1519 1520 1521 1522 | # Flush the in-memory tree to disk and merge all runs together into # a single b-tree structure. Because this annihilates all delete keys, # the next rowid allocated for each table with an IPK will be as expected # by SQLite 3 tests. # proc optimize_db {} { | > > | > > | 1516 1517 1518 1519 1520 1521 1522 1523 1524 1525 1526 1527 1528 1529 1530 1531 1532 1533 1534 | # Flush the in-memory tree to disk and merge all runs together into # a single b-tree structure. Because this annihilates all delete keys, # the next rowid allocated for each table with an IPK will be as expected # by SQLite 3 tests. # proc optimize_db {} { #catch { sqlite4_lsm_flush db main sqlite4_lsm_work db main -opt -flush 100000 sqlite4_lsm_checkpoint db main #} return "" } # If the library is compiled with the SQLITE4_DEFAULT_AUTOVACUUM macro set # to non-zero, then set the global variable $AUTOVACUUM to 1. set AUTOVACUUM $sqlite_options(default_autovacuum) |
︙ | ︙ |
Changes to tool/lsmperf.tcl.
︙ | ︙ | |||
187 188 189 190 191 192 193 | append script $data4 append script "pause -1\n" exec_gnuplot_script $script $zPng } do_write_test x.png 600 50000 50000 20 { | | | | 187 188 189 190 191 192 193 194 195 196 197 198 199 200 201 202 | append script $data4 append script "pause -1\n" exec_gnuplot_script $script $zPng } do_write_test x.png 600 50000 50000 20 { lsm-st "mmap=1 multi_proc=0 safety=1 threads=1 autowork=1" lsm-st2 "page_size=1024 mmap=1 multi_proc=0 safety=1 threads=1 autowork=1" } # lsm-mt "mmap=1 multi_proc=0 threads=2 autowork=0 autocheckpoint=8192000" # lsm-mt "mmap=1 multi_proc=0 safety=1 threads=3 autowork=0" # lsm-st "mmap=1 multi_proc=0 safety=1 threads=1 autowork=1" # lsm-mt "mmap=1 multi_proc=0 safety=1 threads=3 autowork=0" # lsm-mt "mmap=1 multi_proc=0 safety=1 threads=3 autowork=0" |
︙ | ︙ |