Many hyperlinks are disabled. Use anonymous login to enable hyperlinks.
Overview
Comment: | Two optimizations to the pager: (1) Write dirty pages back to the database file in order and (2) Keep a separate list of in-memory pages that are in the checkpoint journal in order to speed a checkpoint commit. (CVS 783) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: | a6ef6657a4377684dc2fce7be2bbf009 |
User & Date: | drh 2002-11-10 23:32:57.000 |
Context
2002-11-11
| ||
00:05 | Replace the atoi() library routine with a faster home-grown version in the VDBE. This gives a dramatic speed improvement for some kinds of queries. (CVS 784) (check-in: 263a8ca40f user: drh tags: trunk) | |
2002-11-10
| ||
23:32 | Two optimizations to the pager: (1) Write dirty pages back to the database file in order and (2) Keep a separate list of in-memory pages that are in the checkpoint journal in order to speed a checkpoint commit. (CVS 783) (check-in: a6ef6657a4 user: drh tags: trunk) | |
2002-11-09
| ||
00:33 | Try to better detect when the library is compiled for large file support (LFS) but the support is not available in the host OS kernel. (CVS 782) (check-in: a29d60ecc5 user: drh tags: trunk) | |
Changes
Changes to src/pager.c.
︙ | ︙ | |||
14 15 16 17 18 19 20 | ** The pager is used to access a database disk file. It implements ** atomic commit and rollback through the use of a journal file that ** is separate from the database file. The pager also implements file ** locking to prevent two processes from writing the same database ** file simultaneously, or one process from reading the database while ** another is writing. ** | | | 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 | ** The pager is used to access a database disk file. It implements ** atomic commit and rollback through the use of a journal file that ** is separate from the database file. The pager also implements file ** locking to prevent two processes from writing the same database ** file simultaneously, or one process from reading the database while ** another is writing. ** ** @(#) $Id: pager.c,v 1.57 2002/11/10 23:32:57 drh Exp $ */ #include "os.h" /* Must be first to enable large file support */ #include "sqliteInt.h" #include "pager.h" #include <assert.h> #include <string.h> |
︙ | ︙ | |||
70 71 72 73 74 75 76 77 78 79 80 81 82 83 | struct PgHdr { Pager *pPager; /* The pager to which this page belongs */ Pgno pgno; /* The page number for this page */ PgHdr *pNextHash, *pPrevHash; /* Hash collision chain for PgHdr.pgno */ int nRef; /* Number of users of this page */ PgHdr *pNextFree, *pPrevFree; /* Freelist of pages where nRef==0 */ PgHdr *pNextAll, *pPrevAll; /* A list of all pages */ u8 inJournal; /* TRUE if has been written to journal */ u8 inCkpt; /* TRUE if written to the checkpoint journal */ u8 dirty; /* TRUE if we need to write back changes */ u8 alwaysRollback; /* Disable dont_rollback() for this page */ /* SQLITE_PAGE_SIZE bytes of page data follow this header */ /* Pager.nExtra bytes of local data follow the page data */ }; | > > | 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 | struct PgHdr { Pager *pPager; /* The pager to which this page belongs */ Pgno pgno; /* The page number for this page */ PgHdr *pNextHash, *pPrevHash; /* Hash collision chain for PgHdr.pgno */ int nRef; /* Number of users of this page */ PgHdr *pNextFree, *pPrevFree; /* Freelist of pages where nRef==0 */ PgHdr *pNextAll, *pPrevAll; /* A list of all pages */ PgHdr *pNextCkpt, *pPrevCkpt; /* List of pages in the checkpoint journal */ PgHdr *pSort; /* Next in list of pages to be written */ u8 inJournal; /* TRUE if has been written to journal */ u8 inCkpt; /* TRUE if written to the checkpoint journal */ u8 dirty; /* TRUE if we need to write back changes */ u8 alwaysRollback; /* Disable dont_rollback() for this page */ /* SQLITE_PAGE_SIZE bytes of page data follow this header */ /* Pager.nExtra bytes of local data follow the page data */ }; |
︙ | ︙ | |||
126 127 128 129 130 131 132 133 134 135 136 137 138 139 | u8 dirtyFile; /* True if database file has changed in any way */ u8 alwaysRollback; /* Disable dont_rollback() for all pages */ u8 journalFormat; /* Version number of the journal file */ u8 *aInJournal; /* One bit for each page in the database file */ u8 *aInCkpt; /* One bit for each page in the database */ PgHdr *pFirst, *pLast; /* List of free pages */ PgHdr *pAll; /* List of all pages */ PgHdr *aHash[N_PG_HASH]; /* Hash table to map page number of PgHdr */ }; /* ** These are bits that can be set in Pager.errMask. */ #define PAGER_ERR_FULL 0x01 /* a write() failed */ | > | 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 | u8 dirtyFile; /* True if database file has changed in any way */ u8 alwaysRollback; /* Disable dont_rollback() for all pages */ u8 journalFormat; /* Version number of the journal file */ u8 *aInJournal; /* One bit for each page in the database file */ u8 *aInCkpt; /* One bit for each page in the database */ PgHdr *pFirst, *pLast; /* List of free pages */ PgHdr *pAll; /* List of all pages */ PgHdr *pCkpt; /* List of pages in the checkpoint journal */ PgHdr *aHash[N_PG_HASH]; /* Hash table to map page number of PgHdr */ }; /* ** These are bits that can be set in Pager.errMask. */ #define PAGER_ERR_FULL 0x01 /* a write() failed */ |
︙ | ︙ | |||
248 249 250 251 252 253 254 255 256 257 258 259 260 261 | if( pPager->errMask & PAGER_ERR_LOCK ) rc = SQLITE_PROTOCOL; if( pPager->errMask & PAGER_ERR_DISK ) rc = SQLITE_IOERR; if( pPager->errMask & PAGER_ERR_FULL ) rc = SQLITE_FULL; if( pPager->errMask & PAGER_ERR_MEM ) rc = SQLITE_NOMEM; if( pPager->errMask & PAGER_ERR_CORRUPT ) rc = SQLITE_CORRUPT; return rc; } /* ** Find a page in the hash table given its page number. Return ** a pointer to the page or NULL if not found. */ static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){ PgHdr *p = pPager->aHash[pgno % N_PG_HASH]; | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 | if( pPager->errMask & PAGER_ERR_LOCK ) rc = SQLITE_PROTOCOL; if( pPager->errMask & PAGER_ERR_DISK ) rc = SQLITE_IOERR; if( pPager->errMask & PAGER_ERR_FULL ) rc = SQLITE_FULL; if( pPager->errMask & PAGER_ERR_MEM ) rc = SQLITE_NOMEM; if( pPager->errMask & PAGER_ERR_CORRUPT ) rc = SQLITE_CORRUPT; return rc; } /* ** Add or remove a page from the list of all pages that are in the ** checkpoint journal. ** ** The Pager keeps a separate list of pages that are currently in ** the checkpoint journal. This helps the sqlitepager_ckpt_commit() ** routine run MUCH faster for the common case where there are many ** pages in memory but only a few are in the checkpoint journal. 
*/ static void page_add_to_ckpt_list(PgHdr *pPg){ Pager *pPager = pPg->pPager; if( pPg->inCkpt ) return; assert( pPg->pPrevCkpt==0 && pPg->pNextCkpt==0 ); pPg->pPrevCkpt = 0; if( pPager->pCkpt ){ pPager->pCkpt->pPrevCkpt = pPg; } pPg->pNextCkpt = pPager->pCkpt; pPager->pCkpt = pPg; pPg->inCkpt = 1; } static void page_remove_from_ckpt_list(PgHdr *pPg){ if( !pPg->inCkpt ) return; if( pPg->pPrevCkpt ){ assert( pPg->pPrevCkpt->pNextCkpt==pPg ); pPg->pPrevCkpt->pNextCkpt = pPg->pNextCkpt; }else{ assert( pPg->pPager->pCkpt==pPg ); pPg->pPager->pCkpt = pPg->pNextCkpt; } if( pPg->pNextCkpt ){ assert( pPg->pNextCkpt->pPrevCkpt==pPg ); pPg->pNextCkpt->pPrevCkpt = pPg->pPrevCkpt; } pPg->pNextCkpt = 0; pPg->pPrevCkpt = 0; pPg->inCkpt = 0; } /* ** Find a page in the hash table given its page number. Return ** a pointer to the page or NULL if not found. */ static PgHdr *pager_lookup(Pager *pPager, Pgno pgno){ PgHdr *p = pPager->aHash[pgno % N_PG_HASH]; |
︙ | ︙ | |||
747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 | ** a reference to the page data. */ int sqlitepager_ref(void *pData){ PgHdr *pPg = DATA_TO_PGHDR(pData); page_ref(pPg); return SQLITE_OK; } /* ** Sync the journal and then write all free dirty pages to the database ** file. ** ** Writing all free dirty pages to the database after the sync is a ** non-obvious optimization. fsync() is an expensive operation so we ** want to minimize the number ot times it is called. After an fsync() call, ** we are free to write dirty pages back to the database. It is best ** to go ahead and write as many dirty pages as possible to minimize ** the risk of having to do another fsync() later on. Writing dirty ** free pages in this way was observed to make database operations go ** up to 10 times faster. ** ** If we are writing to temporary database, there is no need to preserve ** the integrity of the journal file, so we can save time and skip the ** fsync(). 
*/ static int syncAllPages(Pager *pPager){ PgHdr *pPg; int rc = SQLITE_OK; if( pPager->needSync ){ if( !pPager->tempFile ){ rc = sqliteOsSync(&pPager->jfd); if( rc!=0 ) return rc; } pPager->needSync = 0; } for(pPg=pPager->pFirst; pPg; pPg=pPg->pNextFree){ if( pPg->dirty ){ sqliteOsSeek(&pPager->fd, (pPg->pgno-1)*SQLITE_PAGE_SIZE); | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | | < > | 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 | ** a reference to the page data. */ int sqlitepager_ref(void *pData){ PgHdr *pPg = DATA_TO_PGHDR(pData); page_ref(pPg); return SQLITE_OK; } /* ** The parameters are pointers to the head of two sorted lists ** of page headers. Merge these two lists together and return ** a single sorted list. This routine forms the core of the ** merge-sort algorithm that sorts dirty pages into accending ** order prior to writing them back to the disk. ** ** In the case of a tie, left sorts in front of right. ** ** Headers are sorted in order of ascending page number. 
*/ static PgHdr *page_merge(PgHdr *pLeft, PgHdr *pRight){ PgHdr sHead; PgHdr *pTail; pTail = &sHead; pTail->pSort = 0; while( pLeft && pRight ){ if( pLeft->pgno<=pRight->pgno ){ pTail->pSort = pLeft; pLeft = pLeft->pSort; }else{ pTail->pSort = pRight; pRight = pRight->pSort; } pTail = pTail->pSort; } if( pLeft ){ pTail->pSort = pLeft; }else if( pRight ){ pTail->pSort = pRight; } return sHead.pSort; } /* ** Sync the journal and then write all free dirty pages to the database ** file. ** ** Writing all free dirty pages to the database after the sync is a ** non-obvious optimization. fsync() is an expensive operation so we ** want to minimize the number ot times it is called. After an fsync() call, ** we are free to write dirty pages back to the database. It is best ** to go ahead and write as many dirty pages as possible to minimize ** the risk of having to do another fsync() later on. Writing dirty ** free pages in this way was observed to make database operations go ** up to 10 times faster. ** ** If we are writing to temporary database, there is no need to preserve ** the integrity of the journal file, so we can save time and skip the ** fsync(). ** ** This routine goes to the extra trouble of sorting all the dirty ** pages by their page number prior to writing them. Tests show that ** writing pages in order by page number gives a modest speed improvement ** under Linux. */ static int syncAllPages(Pager *pPager){ PgHdr *pPg; PgHdr *pToWrite; # define NSORT 28 Pgno lastPgno; int i; PgHdr *apSorter[NSORT]; int rc = SQLITE_OK; /* Sync the journal before modifying the main database ** (assuming there is a journal and it needs to be synced.) 
 */ if( pPager->needSync ){ if( !pPager->tempFile ){ rc = sqliteOsSync(&pPager->jfd); if( rc!=0 ) return rc; } pPager->needSync = 0; } /* Create a list of all dirty pages */ pToWrite = 0; for(pPg=pPager->pFirst; pPg; pPg=pPg->pNextFree){ if( pPg->dirty ){ pPg->pSort = pToWrite; pToWrite = pPg; } } /* Sort the list of dirty pages into ascending order by ** page number */ for(i=0; i<NSORT; i++){ apSorter[i] = 0; } while( pToWrite ){ pPg = pToWrite; pToWrite = pPg->pSort; pPg->pSort = 0; for(i=0; i<NSORT-1; i++){ if( apSorter[i]==0 ){ apSorter[i] = pPg; break; }else{ pPg = page_merge(apSorter[i], pPg); apSorter[i] = 0; } } if( i>=NSORT-1 ){ apSorter[NSORT-1] = page_merge(apSorter[NSORT-1],pPg); } } pToWrite = 0; for(i=0; i<NSORT; i++){ pToWrite = page_merge(apSorter[i], pToWrite); } /* Write all dirty pages back to the database and mark ** them all clean. */ lastPgno = 0; for(pPg=pToWrite; pPg; pPg=pPg->pSort){ if( lastPgno==0 || pPg->pgno!=lastPgno-1 ){ sqliteOsSeek(&pPager->fd, (pPg->pgno-1)*SQLITE_PAGE_SIZE); } rc = sqliteOsWrite(&pPager->fd, PGHDR_TO_DATA(pPg), SQLITE_PAGE_SIZE); if( rc!=SQLITE_OK ) break; pPg->dirty = 0; lastPgno = pPg->pgno; } return rc; } /* ** Acquire a page. ** |
︙ | ︙ | |||
980 981 982 983 984 985 986 | } pPg->pgno = pgno; if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){ pPg->inJournal = (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0; }else{ pPg->inJournal = 0; } | | | > | | 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 1132 1133 | } pPg->pgno = pgno; if( pPager->aInJournal && (int)pgno<=pPager->origDbSize ){ pPg->inJournal = (pPager->aInJournal[pgno/8] & (1<<(pgno&7)))!=0; }else{ pPg->inJournal = 0; } if( pPager->aInCkpt && (int)pgno<=pPager->ckptSize && (pPager->aInCkpt[pgno/8] & (1<<(pgno&7)))!=0 ){ page_add_to_ckpt_list(pPg); }else{ page_remove_from_ckpt_list(pPg); } pPg->dirty = 0; pPg->nRef = 1; REFINFO(pPg); pPager->nRef++; h = pager_hash(pgno); pPg->pNextHash = pPager->aHash[h]; |
︙ | ︙ | |||
1244 1245 1246 1247 1248 1249 1250 | } assert( pPager->aInJournal!=0 ); pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7); pPager->needSync = !pPager->noSync; pPg->inJournal = 1; if( pPager->ckptInUse ){ pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7); | | | | 1380 1381 1382 1383 1384 1385 1386 1387 1388 1389 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 1413 1414 | } assert( pPager->aInJournal!=0 ); pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7); pPager->needSync = !pPager->noSync; pPg->inJournal = 1; if( pPager->ckptInUse ){ pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7); page_add_to_ckpt_list(pPg); } } /* If the checkpoint journal is open and the page is not in it, ** then write the current page to the checkpoint journal. */ if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){ assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize ); rc = write32bits(&pPager->cpfd, pPg->pgno); if( rc==SQLITE_OK ){ rc = sqliteOsWrite(&pPager->cpfd, pData, SQLITE_PAGE_SIZE); } if( rc!=SQLITE_OK ){ sqlitepager_rollback(pPager); pPager->errMask |= PAGER_ERR_FULL; return rc; } assert( pPager->aInCkpt!=0 ); pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7); page_add_to_ckpt_list(pPg); } /* Update the database size and return. */ if( pPager->dbSize<(int)pPg->pgno ){ pPager->dbSize = pPg->pgno; } |
︙ | ︙ | |||
1348 1349 1350 1351 1352 1353 1354 | if( pPg->alwaysRollback || pPager->alwaysRollback ) return; if( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ){ assert( pPager->aInJournal!=0 ); pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7); pPg->inJournal = 1; if( pPager->ckptInUse ){ pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7); | | | | 1484 1485 1486 1487 1488 1489 1490 1491 1492 1493 1494 1495 1496 1497 1498 1499 1500 1501 1502 1503 1504 1505 | if( pPg->alwaysRollback || pPager->alwaysRollback ) return; if( !pPg->inJournal && (int)pPg->pgno <= pPager->origDbSize ){ assert( pPager->aInJournal!=0 ); pPager->aInJournal[pPg->pgno/8] |= 1<<(pPg->pgno&7); pPg->inJournal = 1; if( pPager->ckptInUse ){ pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7); page_add_to_ckpt_list(pPg); } } if( pPager->ckptInUse && !pPg->inCkpt && (int)pPg->pgno<=pPager->ckptSize ){ assert( pPg->inJournal || (int)pPg->pgno>pPager->origDbSize ); assert( pPager->aInCkpt!=0 ); pPager->aInCkpt[pPg->pgno/8] |= 1<<(pPg->pgno&7); page_add_to_ckpt_list(pPg); } } /* ** Commit all changes to the database and release the write lock. ** ** If the commit fails for any reason, a rollback attempt is made |
︙ | ︙ | |||
1517 1518 1519 1520 1521 1522 1523 | } /* ** Commit a checkpoint. */ int sqlitepager_ckpt_commit(Pager *pPager){ if( pPager->ckptInUse ){ | | | > > > > | 1653 1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 1668 1669 1670 1671 1672 1673 1674 1675 1676 1677 1678 1679 | } /* ** Commit a checkpoint. */ int sqlitepager_ckpt_commit(Pager *pPager){ if( pPager->ckptInUse ){ PgHdr *pPg, *pNext; sqliteOsSeek(&pPager->cpfd, 0); sqliteOsTruncate(&pPager->cpfd, 0); pPager->ckptInUse = 0; sqliteFree( pPager->aInCkpt ); pPager->aInCkpt = 0; for(pPg=pPager->pCkpt; pPg; pPg=pNext){ pNext = pPg->pNextCkpt; assert( pPg->inCkpt ); pPg->inCkpt = 0; pPg->pPrevCkpt = pPg->pNextCkpt = 0; } pPager->pCkpt = 0; } return SQLITE_OK; } /* ** Rollback a checkpoint. */ |
︙ | ︙ |