Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Fix inaccuracies and add details to comments in the pager. Change the name of one function to make its purpose clearer. Ticket #599. (CVS 1209) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
48832d35ed0d5ba02908822c749591e7 |
User & Date: | drh 2004-02-08 06:05:46.000 |
Context
2004-02-08
| ||
06:06 | Add the crashtest1.c program used to test the ability of the database to survive a program crash or power failure. Ticket #599. (CVS 1210) (check-in: 597a59a72d user: drh tags: trunk) | |
06:05 | Fix inaccuracies and add details to comments in the pager. Change the name of one function to make its purpose clearer. Ticket #599. (CVS 1209) (check-in: 48832d35ed user: drh tags: trunk) | |
00:40 | Preliminary fix for ticket #599. More testing and analysis needed. (CVS 1208) (check-in: dc5be2c82b user: drh tags: trunk) | |
Changes
Changes to src/pager.c.
︙ | ︙ | |||
14 15 16 17 18 19 20 | ** The pager is used to access a database disk file. It implements ** atomic commit and rollback through the use of a journal file that ** is separate from the database file. The pager also implements file ** locking to prevent two processes from writing the same database ** file simultaneously, or one process from reading the database while ** another is writing. ** | | | 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | ** The pager is used to access a database disk file. It implements ** atomic commit and rollback through the use of a journal file that ** is separate from the database file. The pager also implements file ** locking to prevent two processes from writing the same database ** file simultaneously, or one process from reading the database while ** another is writing. ** ** @(#) $Id: pager.c,v 1.94 2004/02/08 06:05:46 drh Exp $ */ #include "os.h" /* Must be first to enable large file support */ #include "sqliteInt.h" #include "pager.h" #include <assert.h> #include <string.h> |
︙ | ︙ | |||
142 143 144 145 146 147 148 | int nExtra; /* Add this many bytes to each in-memory page */ void (*xDestructor)(void*); /* Call this routine when freeing pages */ int nPage; /* Total number of in-memory pages */ int nRef; /* Number of in-memory pages with PgHdr.nRef>0 */ int mxPage; /* Maximum number of pages to hold in cache */ int nHit, nMiss, nOvfl; /* Cache hits, missing, and LRU overflows */ u8 journalOpen; /* True if journal file descriptor is valid */ | | | | 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 | int nExtra; /* Add this many bytes to each in-memory page */ void (*xDestructor)(void*); /* Call this routine when freeing pages */ int nPage; /* Total number of in-memory pages */ int nRef; /* Number of in-memory pages with PgHdr.nRef>0 */ int mxPage; /* Maximum number of pages to hold in cache */ int nHit, nMiss, nOvfl; /* Cache hits, missing, and LRU overflows */ u8 journalOpen; /* True if journal file descriptor is valid */ u8 journalStarted; /* True if header of journal is synced */ u8 useJournal; /* Use a rollback journal on this file */ u8 ckptOpen; /* True if the checkpoint journal is open */ u8 ckptInUse; /* True if we are in a checkpoint */ u8 ckptAutoopen; /* Open ckpt journal when main journal is opened*/ u8 noSync; /* Do not sync the journal if true */ u8 fullSync; /* Do extra syncs of the journal for robustness */ u8 state; /* SQLITE_UNLOCK, _READLOCK or _WRITELOCK */ u8 errMask; /* One of several kinds of errors */ |
︙ | ︙ | |||
275 276 277 278 279 280 281 | } # define REFINFO(X) pager_refinfo(X) #else # define REFINFO(X) #endif /* | | > > > > > > | > > > > > | > > > | 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 | } # define REFINFO(X) pager_refinfo(X) #else # define REFINFO(X) #endif /* ** Read a 32-bit integer from the given file descriptor. Store the integer ** that is read in *pRes. Return SQLITE_OK if everything worked, or an ** error code if something goes wrong. ** ** If the journal format is 2 or 3, read a big-endian integer. If the ** journal format is 1, read an integer in the native byte-order of the ** host machine. */ static int read32bits(int format, OsFile *fd, u32 *pRes){ u32 res; int rc; rc = sqliteOsRead(fd, &res, sizeof(res)); if( rc==SQLITE_OK && format>JOURNAL_FORMAT_1 ){ unsigned char ac[4]; memcpy(ac, &res, 4); res = (ac[0]<<24) | (ac[1]<<16) | (ac[2]<<8) | ac[3]; } *pRes = res; return rc; } /* ** Write a 32-bit integer into the given file descriptor. Return SQLITE_OK ** on success or an error code if something goes wrong. ** ** If the journal format is 2 or 3, write the integer as 4 big-endian ** bytes. If the journal format is 1, write the integer in the native ** byte order. In normal operation, only formats 2 and 3 are used. ** Journal format 1 is only used for testing. */ static int write32bits(OsFile *fd, u32 val){ unsigned char ac[4]; if( journal_format<=1 ){ return sqliteOsWrite(fd, &val, 4); } ac[0] = (val>>24) & 0xff; ac[1] = (val>>16) & 0xff; ac[2] = (val>>8) & 0xff; ac[3] = val & 0xff; return sqliteOsWrite(fd, ac, 4); } /* ** Write a 32-bit integer into a page header right before the ** page data. This will overwrite the PgHdr.pDirty pointer. ** ** The integer is big-endian for formats 2 and 3 and native byte order ** for journal format 1. 
*/ static void store32bits(u32 val, PgHdr *p, int offset){ unsigned char *ac; ac = &((unsigned char*)PGHDR_TO_DATA(p))[offset]; if( journal_format<=1 ){ memcpy(ac, &val, 4); }else{ |
︙ | ︙ | |||
465 466 467 468 469 470 471 472 473 474 475 476 477 478 | pPager->state = SQLITE_UNLOCK; } return rc; } /* ** Compute and return a checksum for the page of data. */ static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){ u32 cksum = pPager->cksumInit + pgno; return cksum; } /* | > > > > | 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 496 | pPager->state = SQLITE_UNLOCK; } return rc; } /* ** Compute and return a checksum for the page of data. ** ** This is not a real checksum. It is really just the sum of the ** random initial value and the page number. We considered doing a checksum ** of the database, but that was found to be too slow. */ static u32 pager_cksum(Pager *pPager, Pgno pgno, const char *aData){ u32 cksum = pPager->cksumInit + pgno; return cksum; } /* |
︙ | ︙ | |||
533 534 535 536 537 538 539 | return rc; } /* ** Playback the journal and thus restore the database file to ** the state it was in before we started making changes. ** | | > > > > > > > > > | | | | > | > > > > > > > > > > > > > > > > > > > > > > > > | < < | | | | | 551 552 553 554 555 556 557 558 559 560 561 562 563 564 565 566 567 568 569 570 571 572 573 574 575 576 577 578 579 580 581 582 583 584 585 586 587 588 589 590 591 592 593 594 595 596 597 598 599 600 601 602 603 604 605 606 607 608 609 610 611 | return rc; } /* ** Playback the journal and thus restore the database file to ** the state it was in before we started making changes. ** ** The journal file format is as follows: ** ** * 8 byte prefix. One of the aJournalMagic123 vectors defined ** above. The format of the journal file is determined by which ** of the three prefix vectors is seen. ** * 4 byte big-endian integer which is the number of valid page records ** in the journal. If this value is 0xffffffff, then compute the ** number of page records from the journal size. This field appears ** in format 3 only. ** * 4 byte big-endian integer which is the initial value for the ** sanity checksum. This field appears in format 3 only. ** * 4 byte integer which is the number of pages to truncate the ** database to during a rollback. ** * Zero or more pages instances, each as follows: ** + 4 byte page number. ** + SQLITE_PAGE_SIZE bytes of data. ** + 4 byte checksum (format 3 only) ** ** When we speak of the journal header, we mean the first 4 bullets above. ** Each entry in the journal is an instance of the 5th bullet. Note that ** bullets 2 and 3 only appear in format-3 journals. ** ** Call the value from the second bullet "nRec". nRec is the number of ** valid page entries in the journal. In most cases, you can compute the ** value of nRec from the size of the journal file. 
But if a power ** failure occurred while the journal was being written, it could be the ** case that the size of the journal file had already been increased but ** the extra entries had not yet made it safely to disk. In such a case, ** the value of nRec computed from the file size would be too large. For ** that reason, we always use the nRec value in the header. ** ** If the nRec value is 0xffffffff it means that nRec should be computed ** from the file size. This value is used when the user selects the ** no-sync option for the journal. A power failure could lead to corruption ** in this case. But for things like temporary table (which will be ** deleted when the power is restored) we don't care. ** ** Journal formats 1 and 2 do not have an nRec value in the header so we ** have to compute nRec from the file size. This has risks (as described ** above) which is why all persistent tables have been changed to use ** format 3. ** ** If the file opened as the journal file is not a well-formed ** journal file then the database will likely already be ** corrupted, so the PAGER_ERR_CORRUPT bit is set in pPager->errMask ** and SQLITE_CORRUPT is returned. If it all works, then this routine ** returns SQLITE_OK. */ static int pager_playback(Pager *pPager, int useJournalSize){ off_t szJ; /* Size of the journal file in bytes */ int nRec; /* Number of Records in the journal */ int i; /* Loop counter */ Pgno mxPg = 0; /* Size of the original file in pages */ int format; /* Format of the journal file. */ |
︙ | ︙ | |||
569 570 571 572 573 574 575 | sqliteOsSeek(&pPager->jfd, 0); rc = sqliteOsFileSize(&pPager->jfd, &szJ); if( rc!=SQLITE_OK ){ goto end_playback; } /* If the journal file is too small to contain a complete header, | > | > > | 619 620 621 622 623 624 625 626 627 628 629 630 631 632 633 634 635 636 | sqliteOsSeek(&pPager->jfd, 0); rc = sqliteOsFileSize(&pPager->jfd, &szJ); if( rc!=SQLITE_OK ){ goto end_playback; } /* If the journal file is too small to contain a complete header, ** it must mean that the process that created the journal was just ** beginning to write the journal file when it died. In that case, ** the database file should have still been completely unchanged. ** Nothing needs to be rolled back. We can safely ignore this journal. */ if( szJ < sizeof(aMagic)+sizeof(Pgno) ){ goto end_playback; } /* Read the beginning of the journal and truncate the ** database file back to its original size. |
︙ | ︙ | |||
599 600 601 602 603 604 605 | } if( format>=JOURNAL_FORMAT_3 ){ if( szJ < sizeof(aMagic) + 3*sizeof(u32) ){ /* Ignore the journal if it is too small to contain a complete ** header. We already did this test once above, but at the prior ** test, we did not know the journal format and so we had to assume ** the smallest possible header. Now we know the header is bigger | | | 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 | } if( format>=JOURNAL_FORMAT_3 ){ if( szJ < sizeof(aMagic) + 3*sizeof(u32) ){ /* Ignore the journal if it is too small to contain a complete ** header. We already did this test once above, but at the prior ** test, we did not know the journal format and so we had to assume ** the smallest possible header. Now we know the header is bigger ** than the minimum so we test again. */ goto end_playback; } rc = read32bits(format, &pPager->jfd, (u32*)&nRec); if( rc ) goto end_playback; rc = read32bits(format, &pPager->jfd, &pPager->cksumInit); if( rc ) goto end_playback; |
︙ | ︙ | |||
781 782 783 784 785 786 787 | ** database. This is normally adequate protection, but ** it is theoretically possible, though very unlikely, ** that an inopportune power failure could leave the journal ** in a state which would cause damage to the database ** when it is rolled back. ** ** FULL The journal is synced twice before writes begin on the | | > | | 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 | ** database. This is normally adequate protection, but ** it is theoretically possible, though very unlikely, ** that an inopportune power failure could leave the journal ** in a state which would cause damage to the database ** when it is rolled back. ** ** FULL The journal is synced twice before writes begin on the ** database (with some additional information - the nRec field ** of the journal header - being written in between the two ** syncs). If we assume that writing a ** single disk sector is atomic, then this mode provides ** assurance that the journal will not be corrupted to the ** point of causing damage to the database during rollback. ** ** Numeric values associated with these states are OFF==1, NORMAL=2, ** and FULL=3. */ |
︙ | ︙ | |||
942 943 944 945 946 947 948 | } return n; } /* ** Forward declaration */ | | | | 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 | } return n; } /* ** Forward declaration */ static int syncJournal(Pager*); /* ** Truncate the file to the number of pages specified. */ int sqlitepager_truncate(Pager *pPager, Pgno nPage){ int rc; if( pPager->dbSize<0 ){ sqlitepager_pagecount(pPager); } if( pPager->errMask!=0 ){ rc = pager_errcode(pPager); return rc; } if( nPage>=(unsigned)pPager->dbSize ){ return SQLITE_OK; } syncJournal(pPager); rc = sqliteOsTruncate(&pPager->fd, SQLITE_PAGE_SIZE*(off_t)nPage); if( rc==SQLITE_OK ){ pPager->dbSize = nPage; } return rc; } |
︙ | ︙ | |||
1065 1066 1067 1068 1069 1070 1071 | int sqlitepager_ref(void *pData){ PgHdr *pPg = DATA_TO_PGHDR(pData); page_ref(pPg); return SQLITE_OK; } /* | > > > > > > | > > > > | < | < < < < < < < > | | > > > > | 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 1134 1135 1136 1137 1138 1139 1140 1141 1142 1143 1144 1145 1146 1147 1148 1149 1150 1151 1152 1153 1154 1155 1156 1157 1158 1159 1160 1161 1162 1163 1164 1165 1166 1167 1168 1169 1170 1171 1172 1173 1174 1175 1176 1177 | int sqlitepager_ref(void *pData){ PgHdr *pPg = DATA_TO_PGHDR(pData); page_ref(pPg); return SQLITE_OK; } /* ** Sync the journal. In other words, make sure all the pages that have ** been written to the journal have actually reached the surface of the ** disk. It is not safe to modify the original database file until after ** the journal has been synced. If the original database is modified before ** the journal is synced and a power failure occurs, the unsynced journal ** data would be lost and we would be unable to completely rollback the ** database changes. Database corruption would occur. ** ** This routine also updates the nRec field in the header of the journal. ** (See comments on the pager_playback() routine for additional information.) ** If the sync mode is FULL, two syncs will occur. First the whole journal ** is synced, then the nRec field is updated, then a second sync occurs. ** ** For temporary databases, we do not care if we are able to rollback ** after a power failure, so no sync occurs. ** ** This routine clears the needSync field of every page currently held in ** memory. */ static int syncJournal(Pager *pPager){ PgHdr *pPg; int rc = SQLITE_OK; /* Sync the journal before modifying the main database ** (assuming there is a journal and it needs to be synced.) 
*/ if( pPager->needSync ){ if( !pPager->tempFile ){ assert( pPager->journalOpen ); assert( !pPager->noSync ); #ifndef NDEBUG { /* Make sure the pPager->nRec counter we are keeping agrees ** with the nRec computed from the size of the journal file. */ off_t hdrSz, pgSz, jSz; hdrSz = JOURNAL_HDR_SZ(journal_format); pgSz = JOURNAL_PG_SZ(journal_format); rc = sqliteOsFileSize(&pPager->jfd, &jSz); if( rc!=0 ) return rc; assert( pPager->nRec*pgSz+hdrSz==jSz ); } #endif if( journal_format>=3 ){ /* Write the nRec value into the journal file header */ off_t szJ; if( pPager->fullSync ){ TRACE1("SYNC\n"); rc = sqliteOsSync(&pPager->jfd); if( rc!=0 ) return rc; } sqliteOsSeek(&pPager->jfd, sizeof(aJournalMagic1)); |
︙ | ︙ | |||
1313 1314 1315 1316 1317 1318 1319 | /* If we could not find a page that does not require an fsync() ** on the journal file then fsync the journal file. This is a ** very slow operation, so we work hard to avoid it. But sometimes ** it can't be helped. */ if( pPg==0 ){ | | | 1374 1375 1376 1377 1378 1379 1380 1381 1382 1383 1384 1385 1386 1387 1388 | /* If we could not find a page that does not require an fsync() ** on the journal file then fsync the journal file. This is a ** very slow operation, so we work hard to avoid it. But sometimes ** it can't be helped. */ if( pPg==0 ){ int rc = syncJournal(pPager); if( rc!=0 ){ sqlitepager_rollback(pPager); return SQLITE_IOERR; } pPg = pPager->pFirst; } assert( pPg->nRef==0 ); |
︙ | ︙ | |||
1905 1906 1907 1908 1909 1910 1911 | ** if there have been no changes to the database file. */ assert( pPager->needSync==0 ); rc = pager_unwritelock(pPager); pPager->dbSize = -1; return rc; } assert( pPager->journalOpen ); | | | 1966 1967 1968 1969 1970 1971 1972 1973 1974 1975 1976 1977 1978 1979 1980 | ** if there have been no changes to the database file. */ assert( pPager->needSync==0 ); rc = pager_unwritelock(pPager); pPager->dbSize = -1; return rc; } assert( pPager->journalOpen ); rc = syncJournal(pPager); if( rc!=SQLITE_OK ){ goto commit_abort; } pPg = pager_get_all_dirty_pages(pPager); if( pPg ){ rc = pager_write_pagelist(pPg); if( rc || (!pPager->noSync && sqliteOsSync(&pPager->fd)!=SQLITE_OK) ){ |
︙ | ︙ |