Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Change the log file format to include a small (12 byte) header at the start of the file. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | wal |
Files: | files | file ages | folders |
SHA1: |
9865d14d6041874bc1239ce7a061d5c7 |
User & Date: | dan 2010-04-16 13:59:31.000 |
Context
2010-04-17
| ||
12:31 | Enhancements to wal-mode locking scheme. (check-in: 8549c28649 user: dan tags: wal) | |
2010-04-16
| ||
13:59 | Change the log file format to include a small (12 byte) header at the start of the file. (check-in: 9865d14d60 user: dan tags: wal) | |
11:30 | Fix bug in log recovery (last frame in log was being ignored). Also remove an incorrect assert statement. (check-in: 67d2a89ec2 user: dan tags: wal) | |
Changes
Changes to src/log.c.
1 2 3 4 5 6 7 8 9 10 11 12 | /* ** This file contains the implementation of a log file used in ** "journal_mode=wal" mode. */ #include "log.h" #include <unistd.h> #include <fcntl.h> #include <sys/mman.h> | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | /* ** This file contains the implementation of a log file used in ** "journal_mode=wal" mode. */ /* ** LOG FILE FORMAT ** ** A log file consists of a header followed by zero or more log frames. ** The log header is 12 bytes in size and consists of the following three ** big-endian 32-bit unsigned integer values: ** ** 0: Database page size, ** 4: Randomly selected salt value 1, ** 8: Randomly selected salt value 2. ** ** Immediately following the log header are zero or more log frames. Each ** frame itself consists of a 16-byte header followed by a <page-size> bytes ** of page data. The header is broken into 4 big-endian 32-bit unsigned ** integer values, as follows: ** ** 0: Page number. ** 4: For commit records, the size of the database image in pages ** after the commit. For all other records, zero. ** 8: Checksum value 1. ** 12: Checksum value 2. */ /* ** LOG SUMMARY FORMAT ** ** TODO. */ #include "log.h" #include <unistd.h> #include <fcntl.h> #include <sys/mman.h> |
︙ | ︙ | |||
34 35 36 37 38 39 40 41 42 | }; /* Size of serialized LogSummaryHdr object. */ #define LOGSUMMARY_HDR_NFIELD (sizeof(LogSummaryHdr) / sizeof(u32)) #define LOGSUMMARY_FRAME_OFFSET \ (LOGSUMMARY_HDR_NFIELD + LOG_CKSM_BYTES/sizeof(u32)) /* Size of frame header */ | > > | > > > > > > > > > > | 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 | }; /* Size of serialized LogSummaryHdr object. */ #define LOGSUMMARY_HDR_NFIELD (sizeof(LogSummaryHdr) / sizeof(u32)) #define LOGSUMMARY_FRAME_OFFSET \ (LOGSUMMARY_HDR_NFIELD + LOG_CKSM_BYTES/sizeof(u32)) /* Size of frame header */ #define LOG_FRAME_HDRSIZE 16 #define LOG_HDRSIZE 12 /* ** Return the offset of frame iFrame in the log file, assuming a database ** page size of pgsz bytes. The offset returned is to the start of the ** log frame-header. */ #define logFrameOffset(iFrame, pgsz) ( \ LOG_HDRSIZE + ((iFrame)-1)*((pgsz)+LOG_FRAME_HDRSIZE) \ ) /* ** There is one instance of this structure for each log-summary object ** that this process has a connection to. They are stored in a linked ** list starting at pLogSummary (global variable). ** ** TODO: LogSummary.fd is a unix file descriptor. Unix APIs are used |
︙ | ︙ | |||
120 121 122 123 124 125 126 127 128 129 130 131 132 133 | int nFinal; /* Elements in segment nSegment-1 */ struct LogSegment { int iNext; /* Next aIndex index */ u8 *aIndex; /* Pointer to index array */ u32 *aDbPage; /* Pointer to db page array */ } aSegment[1]; }; /* ** List of all LogSummary objects created by this process. Protected by ** static mutex LOG_SUMMARY_MUTEX. TODO: Should have a dedicated mutex ** here instead of borrowing the LRU mutex. */ | > | 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | int nFinal; /* Elements in segment nSegment-1 */ struct LogSegment { int iNext; /* Next aIndex index */ u8 *aIndex; /* Pointer to index array */ u32 *aDbPage; /* Pointer to db page array */ } aSegment[1]; }; /* ** List of all LogSummary objects created by this process. Protected by ** static mutex LOG_SUMMARY_MUTEX. TODO: Should have a dedicated mutex ** here instead of borrowing the LRU mutex. */ |
︙ | ︙ | |||
287 288 289 290 291 292 293 | if( pSummary->fd>0 ){ close(pSummary->fd); pSummary->fd = -1; } return rc; } | < | 329 330 331 332 333 334 335 336 337 338 339 340 341 342 | if( pSummary->fd>0 ){ close(pSummary->fd); pSummary->fd = -1; } return rc; } static void logSummaryWriteHdr(LogSummary *pSummary, LogSummaryHdr *pHdr){ u32 *aData = pSummary->aData; memcpy(aData, pHdr, sizeof(LogSummaryHdr)); aData[LOGSUMMARY_HDR_NFIELD] = 1; aData[LOGSUMMARY_HDR_NFIELD+1] = 1; logChecksumBytes( (u8 *)aData, sizeof(LogSummaryHdr), &aData[LOGSUMMARY_HDR_NFIELD] |
︙ | ︙ | |||
317 318 319 320 321 322 323 | u32 *aCksum, /* IN/OUT: Checksum values */ u32 iPage, /* Database page number for frame */ u32 nTruncate, /* New db size (or 0 for non-commit frames) */ int nData, /* Database page size (size of aData[]) */ u8 *aData, /* Pointer to page data (for checksum) */ u8 *aFrame /* OUT: Write encoded frame here */ ){ | | | | < | | | | | | | | | | 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 402 403 404 405 406 407 408 409 | u32 *aCksum, /* IN/OUT: Checksum values */ u32 iPage, /* Database page number for frame */ u32 nTruncate, /* New db size (or 0 for non-commit frames) */ int nData, /* Database page size (size of aData[]) */ u8 *aData, /* Pointer to page data (for checksum) */ u8 *aFrame /* OUT: Write encoded frame here */ ){ assert( LOG_FRAME_HDRSIZE==16 ); sqlite3Put4byte(&aFrame[0], iPage); sqlite3Put4byte(&aFrame[4], nTruncate); logChecksumBytes(aFrame, 8, aCksum); logChecksumBytes(aData, nData, aCksum); sqlite3Put4byte(&aFrame[8], aCksum[0]); sqlite3Put4byte(&aFrame[12], aCksum[1]); } /* ** Return 1 and populate *piPage, *pnTruncate and aCksum if the ** frame checksum looks Ok. Otherwise return 0. */ static int logDecodeFrame( u32 *aCksum, /* IN/OUT: Checksum values */ u32 *piPage, /* OUT: Database page number for frame */ u32 *pnTruncate, /* OUT: New db size (or 0 if not commit) */ int nData, /* Database page size (size of aData[]) */ u8 *aData, /* Pointer to page data (for checksum) */ u8 *aFrame /* Frame data */ ){ assert( LOG_FRAME_HDRSIZE==16 ); logChecksumBytes(aFrame, 8, aCksum); logChecksumBytes(aData, nData, aCksum); if( aCksum[0]!=sqlite3Get4byte(&aFrame[8]) || aCksum[1]!=sqlite3Get4byte(&aFrame[12]) ){ /* Checksum failed. */ return 0; } *piPage = sqlite3Get4byte(&aFrame[0]); *pnTruncate = sqlite3Get4byte(&aFrame[4]); return 1; } static void logMergesort8( Pgno *aContent, /* Pages in log */ u8 *aBuffer, /* Buffer of at least *pnList items to use */ u8 *aList, /* IN/OUT: List to sort */ |
︙ | ︙ | |||
482 483 484 485 486 487 488 | u8 aBuf[LOG_FRAME_HDRSIZE]; /* Buffer to load first frame header into */ u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */ int nFrame; /* Number of bytes at aFrame */ u8 *aData; /* Pointer to data part of aFrame buffer */ int iFrame; /* Index of last frame read */ i64 iOffset; /* Next offset to read from log file */ int nPgsz; /* Page size according to the log */ | | | > > < | | 522 523 524 525 526 527 528 529 530 531 532 533 534 535 536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 | u8 aBuf[LOG_FRAME_HDRSIZE]; /* Buffer to load first frame header into */ u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */ int nFrame; /* Number of bytes at aFrame */ u8 *aData; /* Pointer to data part of aFrame buffer */ int iFrame; /* Index of last frame read */ i64 iOffset; /* Next offset to read from log file */ int nPgsz; /* Page size according to the log */ u32 aCksum[2]; /* Running checksum */ /* Read in the first frame header in the file (to determine the ** database page size). */ rc = sqlite3OsRead(pFd, aBuf, LOG_HDRSIZE, 0); if( rc!=SQLITE_OK ){ return rc; } /* If the database page size is not a power of two, or is greater than ** SQLITE_MAX_PAGE_SIZE, conclude that the log file contains no valid data. */ nPgsz = sqlite3Get4byte(&aBuf[0]); if( nPgsz&(nPgsz-1) || nPgsz>SQLITE_MAX_PAGE_SIZE ){ goto finished; } aCksum[0] = sqlite3Get4byte(&aBuf[4]); aCksum[1] = sqlite3Get4byte(&aBuf[8]); /* Malloc a buffer to read frames into. */ nFrame = nPgsz + LOG_FRAME_HDRSIZE; aFrame = (u8 *)sqlite3_malloc(nFrame); if( !aFrame ){ return SQLITE_NOMEM; } aData = &aFrame[LOG_FRAME_HDRSIZE]; /* Read all frames from the log file. */ iFrame = 0; for(iOffset=LOG_HDRSIZE; (iOffset+nFrame)<=nSize; iOffset+=nFrame){ u32 pgno; /* Database page number for frame */ u32 nTruncate; /* dbsize field from frame header */ int isValid; /* True if this frame is valid */ /* Read and decode the next log frame. */ rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset); if( rc!=SQLITE_OK ) break; |
︙ | ︙ | |||
819 820 821 822 823 824 825 | /* Sync the log file to disk */ rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags); if( rc!=SQLITE_OK ) goto out; /* Iterate through the contents of the log, copying data to the db file. */ while( 0==logIteratorNext(pIter, &iDbpage, &iFrame) ){ rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz, | | | 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 | /* Sync the log file to disk */ rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags); if( rc!=SQLITE_OK ) goto out; /* Iterate through the contents of the log, copying data to the db file. */ while( 0==logIteratorNext(pIter, &iDbpage, &iFrame) ){ rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz, logFrameOffset(iFrame, pgsz) + LOG_FRAME_HDRSIZE ); if( rc!=SQLITE_OK ) goto out; rc = sqlite3OsWrite(pFd, zBuf, pgsz, (iDbpage-1)*pgsz); if( rc!=SQLITE_OK ) goto out; } /* Truncate the database file */ |
︙ | ︙ | |||
1315 1316 1317 1318 1319 1320 1321 | } assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno ); /* If iRead is non-zero, then it is the log frame number that contains the ** required page. Read and return data from the log file. */ if( iRead ){ | < | | 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 1367 1368 1369 1370 | } assert( iRead==0 || aData[logSummaryEntry(iRead)]==pgno ); /* If iRead is non-zero, then it is the log frame number that contains the ** required page. Read and return data from the log file. */ if( iRead ){ i64 iOffset = logFrameOffset(iRead, pLog->hdr.pgsz) + LOG_FRAME_HDRSIZE; *pInLog = 1; return sqlite3OsRead(pLog->pFd, pOut, pLog->hdr.pgsz, iOffset); } *pInLog = 0; return SQLITE_OK; } |
︙ | ︙ | |||
1397 1398 1399 1400 1401 1402 1403 | Log *pLog, /* Log handle to write to */ int nPgsz, /* Database page-size in bytes */ PgHdr *pList, /* List of dirty pages to write */ Pgno nTruncate, /* Database size after this commit */ int isCommit, /* True if this is a commit */ int isSync /* True to sync the log file */ ){ | < < < < < < < < < < < < < < < < | | < | > > > > > > > > > > > > > > > > > < | < | 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 1447 1448 1449 1450 1451 1452 1453 1454 1455 1456 1457 1458 1459 1460 1461 1462 1463 1464 1465 1466 1467 1468 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 1484 1485 1486 1487 | Log *pLog, /* Log handle to write to */ int nPgsz, /* Database page-size in bytes */ PgHdr *pList, /* List of dirty pages to write */ Pgno nTruncate, /* Database size after this commit */ int isCommit, /* True if this is a commit */ int isSync /* True to sync the log file */ ){ int rc; /* Used to catch return codes */ u32 iFrame; /* Next frame address */ u8 aFrame[LOG_FRAME_HDRSIZE]; /* Buffer to assemble frame-header in */ PgHdr *p; /* Iterator to run through pList with. */ u32 aCksum[2]; /* Checksums */ PgHdr *pLast; /* Last frame in list */ int nLast = 0; /* Number of extra copies of last page */ assert( LOG_FRAME_HDRSIZE==(4 * 2 + LOG_CKSM_BYTES) ); assert( pList ); /* If this is the first frame written into the log, write the log ** header to the start of the log file. See comments at the top of ** this file for a description of the log-header format. */ assert( LOG_FRAME_HDRSIZE>=LOG_HDRSIZE ); iFrame = pLog->hdr.iLastPg; if( iFrame==0 ){ sqlite3Put4byte(aFrame, nPgsz); sqlite3_randomness(8, &aFrame[4]); pLog->hdr.iCheck1 = sqlite3Get4byte(&aFrame[4]); pLog->hdr.iCheck2 = sqlite3Get4byte(&aFrame[8]); rc = sqlite3OsWrite(pLog->pFd, aFrame, LOG_HDRSIZE, 0); if( rc!=SQLITE_OK ){ return rc; } } aCksum[0] = pLog->hdr.iCheck1; aCksum[1] = pLog->hdr.iCheck2; /* Write the log file. */ for(p=pList; p; p=p->pDirty){ u32 nDbsize; /* Db-size field for frame header */ i64 iOffset; /* Write offset in log file */ iOffset = logFrameOffset(++iFrame, nPgsz); /* Populate and write the frame header */ nDbsize = (isCommit && p->pDirty==0) ? nTruncate : 0; logEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame); rc = sqlite3OsWrite(pLog->pFd, aFrame, sizeof(aFrame), iOffset); if( rc!=SQLITE_OK ){ return rc; |
︙ | ︙ |
Changes to test/wal.test.
︙ | ︙ | |||
24 25 26 27 28 29 30 31 32 33 34 35 36 37 | } proc sqlite3_wal {args} { eval sqlite3 $args [lindex $args 0] eval { PRAGMA journal_mode = wal } } # # These are 'warm-body' tests used while developing the WAL code. They # serve to prove that a few really simple cases work: # # wal-1.*: Read and write the database. # wal-2.*: Test MVCC with one reader, one writer. # wal-3.*: Test transaction rollback. | > > > > | 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 | } proc sqlite3_wal {args} { eval sqlite3 $args [lindex $args 0] eval { PRAGMA journal_mode = wal } } proc log_file_size {nFrame pgsz} { expr {12 + ($pgsz+16)*$nFrame} } # # These are 'warm-body' tests used while developing the WAL code. They # serve to prove that a few really simple cases work: # # wal-1.*: Read and write the database. # wal-2.*: Test MVCC with one reader, one writer. # wal-3.*: Test transaction rollback. |
︙ | ︙ | |||
54 55 56 57 58 59 60 | do_test wal-1.1 { execsql COMMIT list [file exists test.db-journal] [file exists test.db-wal] } {0 1} do_test wal-1.2 { # There are now two pages in the log. file size test.db-wal | | | 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 | do_test wal-1.1 { execsql COMMIT list [file exists test.db-journal] [file exists test.db-wal] } {0 1} do_test wal-1.2 { # There are now two pages in the log. file size test.db-wal } [log_file_size 2 1024] do_test wal-1.3 { execsql { SELECT * FROM sqlite_master } } {table t1 t1 2 {CREATE TABLE t1(a, b)}} do_test wal-1.4 { execsql { INSERT INTO t1 VALUES(1, 2) } |
︙ | ︙ | |||
204 205 206 207 208 209 210 | sqlite3_wal db test.db execsql { PRAGMA page_size = 1024; CREATE TABLE t1(a, b); INSERT INTO t1 VALUES(1, 2); } list [file size test.db] [file size test.db-wal] | | | | 208 209 210 211 212 213 214 215 216 217 218 219 220 221 222 223 224 225 226 | sqlite3_wal db test.db execsql { PRAGMA page_size = 1024; CREATE TABLE t1(a, b); INSERT INTO t1 VALUES(1, 2); } list [file size test.db] [file size test.db-wal] } [list 0 [log_file_size 3 1024]] do_test wal-7.2 { execsql { PRAGMA checkpoint } list [file size test.db] [file size test.db-wal] } [list 2048 [log_file_size 3 1024]] # Execute some transactions in auto-vacuum mode to test database file # truncation. # do_test wal-8.1 { reopen_db execsql { |
︙ | ︙ | |||
543 544 545 546 547 548 549 | PRAGMA page_size = 1024; CREATE TABLE t1(x PRIMARY KEY); } list [expr [file size test.db]/1024] [expr [file size test.db-wal]/1044] } {0 3} do_test wal-11.2 { execsql { PRAGMA checkpoint } | | | | | | | | | | | | | | | | | | | > > | | | 547 548 549 550 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 618 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 637 638 639 640 641 642 643 644 645 646 647 648 649 | PRAGMA page_size = 1024; CREATE TABLE t1(x PRIMARY KEY); } list [expr [file size test.db]/1024] [expr [file size test.db-wal]/1044] } {0 3} do_test wal-11.2 { execsql { PRAGMA checkpoint } list [expr [file size test.db]/1024] [file size test.db-wal] } [list 3 [log_file_size 3 1024]] do_test wal-11.3 { execsql { INSERT INTO t1 VALUES( randomblob(900) ) } list [expr [file size test.db]/1024] [file size test.db-wal] } [list 3 [log_file_size 4 1024]] do_test wal-11.4 { execsql { BEGIN; INSERT INTO t1 SELECT randomblob(900) FROM t1; -- 2 INSERT INTO t1 SELECT randomblob(900) FROM t1; -- 4 INSERT INTO t1 SELECT randomblob(900) FROM t1; -- 8 INSERT INTO t1 SELECT randomblob(900) FROM t1; -- 16 } list [expr [file size test.db]/1024] [file size test.db-wal] } [list 3 [log_file_size 33 1024]] do_test wal-11.5 { execsql { SELECT count(*) FROM t1; PRAGMA integrity_check; } } {16 ok} do_test wal-11.6 { execsql COMMIT list [expr [file size test.db]/1024] [file size test.db-wal] } [list 3 [log_file_size 42 1024]] do_test wal-11.7 { execsql { SELECT count(*) FROM t1; PRAGMA integrity_check; } } {16 ok} do_test wal-11.8 { execsql { PRAGMA checkpoint } list [expr [file size test.db]/1024] [file size test.db-wal] } [list 37 [log_file_size 42 1024]] do_test wal-11.9 { db close sqlite3_wal db test.db list [expr [file size test.db]/1024] [file size test.db-wal] } {37 0} do_test wal-11.10 { execsql { PRAGMA cache_size = 10; BEGIN; INSERT INTO t1 SELECT randomblob(900) FROM t1; -- 32 SELECT count(*) FROM t1; } list [expr [file size test.db]/1024] [file size test.db-wal] } [list 37 [log_file_size 40 1024]] do_test wal-11.11 { execsql { SELECT count(*) FROM t1; ROLLBACK; SELECT count(*) FROM t1; } } {32 16} do_test wal-11.12 { list [expr [file size test.db]/1024] [file size test.db-wal] } [list 37 [log_file_size 40 1024]] do_test wal-11.13 { execsql { INSERT INTO t1 VALUES( randomblob(900) ); SELECT count(*) FROM t1; PRAGMA integrity_check; } } {17 ok} do_test wal-11.14 { list [expr [file size test.db]/1024] [file size test.db-wal] } [list 37 [log_file_size 40 1024]] #------------------------------------------------------------------------- # This block of tests, wal-12.*, tests the fix for a problem that # could occur if a log that is a prefix of an older log is written # into a reused log file. # reopen_db do_test wal-12.1 { execsql { PRAGMA page_size = 1024; CREATE TABLE t1(x, y); CREATE TABLE t2(x, y); INSERT INTO t1 VALUES('A', 1); } list [expr [file size test.db]/1024] [file size test.db-wal] } [list 0 [log_file_size 5 1024]] do_test wal-12.2 { db close sqlite3_wal db test.db execsql { UPDATE t1 SET y = 0 WHERE x = 'A'; } list [expr [file size test.db]/1024] [expr [file size test.db-wal]/1044] |
︙ | ︙ |