Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Refactoring of the WalIterator implementation. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
b5b60fdcc5dcf41f2c79912075ac241f |
User & Date: | drh 2010-05-18 18:01:09.000 |
Context
2010-05-18
| ||
23:29 | Update the wal-index hash format so that hash-table space is reused following a rollback, thus preventing hash table overflows. Add assert()s to verify that hash tables do not overfill. Further refactoring of the wal-index code. (check-in: ada9a8c7b6 user: drh tags: trunk) | |
18:01 | Refactoring of the WalIterator implementation. (check-in: b5b60fdcc5 user: drh tags: trunk) | |
13:27 | Mark the shared-memory in the WAL implementation as volatile. (check-in: 0a6787908e user: drh tags: trunk) | |
Changes
Changes to src/wal.c.
︙ | ︙ | |||
40 41 42 43 44 45 46 | ** 8: Checksum value 1. ** 12: Checksum value 2. */ /* ** WAL-INDEX FILE FORMAT ** | | | 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 | ** 8: Checksum value 1. ** 12: Checksum value 2. */ /* ** WAL-INDEX FILE FORMAT ** ** The wal-index consists of a header region, followed by an ** 8-byte region that contains no useful data (used to apply byte-range locks ** in some implementations), followed by the data region. ** ** The contents of both the header and data region are specified in terms ** of 1, 2 and 4 byte unsigned integers. All integers are stored in ** machine-endian order. The wal-index is not a persistent file and ** so it does not need to be portable across archtectures. |
︙ | ︙ | |||
130 131 132 133 134 135 136 | u8 isWindexOpen; /* True if ShmOpen() called on pDbFd */ WalIndexHdr hdr; /* Wal-index for current snapshot */ char *zWalName; /* Name of WAL file */ }; /* | | | | > > | | | | | | | 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 | u8 isWindexOpen; /* True if ShmOpen() called on pDbFd */ WalIndexHdr hdr; /* Wal-index for current snapshot */ char *zWalName; /* Name of WAL file */ }; /* ** This structure is used to implement an iterator that loops through ** all frames in the WAL in database page order. Where two or more frames ** correspond to the same database page, the iterator visits only the ** frame most recently written to the WAL (in other words, the frame with ** the largest index). ** ** The internals of this structure are only accessed by: ** ** walIteratorInit() - Create a new iterator, ** walIteratorNext() - Step an iterator, ** walIteratorFree() - Free an iterator. ** ** This functionality is used by the checkpoint code (see walCheckpoint()). */ struct WalIterator { int iPrior; /* Last result returned from the iterator */ int nSegment; /* Size of the aSegment[] array */ int nFinal; /* Elements in aSegment[nSegment-1] */ struct WalSegment { int iNext; /* Next slot in aIndex[] not previously returned */ u8 *aIndex; /* i0, i1, i2... such that aPgno[iN] ascending */ u32 *aPgno; /* 256 page numbers. Pointer to Wal.pWiData */ } aSegment[1]; /* One for every 256 entries in the WAL */ }; /* ** Generate an 8 byte checksum based on the data in array aByte[] and the ** initial values of aCksum[0] and aCksum[1]. The checksum is written into ** aCksum[] before returning. |
︙ | ︙ | |||
298 299 300 301 302 303 304 | } *piPage = sqlite3Get4byte(&aFrame[0]); *pnTruncate = sqlite3Get4byte(&aFrame[4]); return 1; } | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | | > > > | > > > > > > | 300 301 302 303 304 305 306 307 308 309 310 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 327 328 329 330 331 332 333 334 335 336 337 | } *piPage = sqlite3Get4byte(&aFrame[0]); *pnTruncate = sqlite3Get4byte(&aFrame[4]); return 1; } /* ** Define the size of the hash tables in the wal-index file. There ** is a hash-table following every HASHTABLE_NPAGE page numbers in the ** wal-index. */ #define HASHTABLE_NPAGE 4096 #define HASHTABLE_DATATYPE u16 #define HASHTABLE_NSLOT (HASHTABLE_NPAGE*2) #define HASHTABLE_NBYTE (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT) /* ** Return the index in the Wal.pWiData array that corresponds to ** frame iFrame. ** ** Wal.pWiData is an array of u32 elements that is the wal-index. ** The array begins with a header and is then followed by alternating ** "map" and "hash-table" blocks. Each "map" block consists of ** HASHTABLE_NPAGE u32 elements which are page numbers corresponding ** to frames in the WAL file. ** ** This routine returns an index X such that Wal.pWiData[X] is part ** of a "map" block that contains the page number of the iFrame-th ** frame in the WAL file. */ static int walIndexEntry(u32 iFrame){ return ( (WALINDEX_LOCK_OFFSET+WALINDEX_LOCK_RESERVED)/sizeof(u32) + (((iFrame-1)/HASHTABLE_NPAGE) * HASHTABLE_NBYTE)/sizeof(u32) + (iFrame-1) ); |
︙ | ︙ | |||
711 712 713 714 715 716 717 718 719 | sqlite3_free(pRet); }else{ *ppWal = pRet; } return rc; } static int walIteratorNext( WalIterator *p, /* Iterator */ | > > > > > > > > > > | | | | | | > > | | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | | | | | | | | | > > > > > > > > > > > < | < < > | < > > > > > | | | | | | | | | | | | | | | | | | | | 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 779 780 781 782 783 784 785 786 787 788 789 790 791 792 793 794 795 796 797 798 799 800 801 802 803 804 805 806 807 808 809 810 811 812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827 828 829 830 831 832 833 834 835 836 837 838 839 840 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 | sqlite3_free(pRet); }else{ *ppWal = pRet; } return rc; } /* ** Find the smallest page number out of all pages held in the WAL that ** has not been returned by any prior invocation of this method on the ** same WalIterator object. Write into *piFrame the frame index where ** that page was last written into the WAL. Write into *piPage the page ** number. ** ** Return 0 on success. If there are no pages in the WAL with a page ** number larger than *piPage, then return 1. */ static int walIteratorNext( WalIterator *p, /* Iterator */ u32 *piPage, /* OUT: The page number of the next page */ u32 *piFrame /* OUT: Wal frame index of next page */ ){ u32 iMin; /* Result pgno must be greater than iMin */ u32 iRet = 0xFFFFFFFF; /* 0xffffffff is never a valid page number */ int i; /* For looping through segments */ int nBlock = p->nFinal; /* Number of entries in current segment */ iMin = p->iPrior; assert( iMin<0xffffffff ); for(i=p->nSegment-1; i>=0; i--){ struct WalSegment *pSegment = &p->aSegment[i]; while( pSegment->iNext<nBlock ){ u32 iPg = pSegment->aPgno[pSegment->aIndex[pSegment->iNext]]; if( iPg>iMin ){ if( iPg<iRet ){ iRet = iPg; *piFrame = i*256 + 1 + pSegment->aIndex[pSegment->iNext]; } break; } pSegment->iNext++; } nBlock = 256; } *piPage = p->iPrior = iRet; return (iRet==0xFFFFFFFF); } static void walMergesort8( Pgno *aContent, /* Pages in wal */ u8 *aBuffer, /* Buffer of at least *pnList items to use */ u8 *aList, /* IN/OUT: List to sort */ int *pnList /* IN/OUT: Number of elements in aList[] */ ){ int nList = *pnList; if( nList>1 ){ int nLeft = nList / 2; /* Elements in left list */ int nRight = nList - nLeft; /* Elements in right list */ u8 *aLeft = aList; /* Left list */ u8 *aRight = &aList[nLeft]; /* Right list */ int iLeft = 0; /* Current index in aLeft */ int iRight = 0; /* Current index in aright */ int iOut = 0; /* Current index in output buffer */ /* TODO: Change to non-recursive version. */ walMergesort8(aContent, aBuffer, aLeft, &nLeft); walMergesort8(aContent, aBuffer, aRight, &nRight); while( iRight<nRight || iLeft<nLeft ){ u8 logpage; Pgno dbpage; if( (iLeft<nLeft) && (iRight>=nRight || aContent[aLeft[iLeft]]<aContent[aRight[iRight]]) ){ logpage = aLeft[iLeft++]; }else{ logpage = aRight[iRight++]; } dbpage = aContent[logpage]; aBuffer[iOut++] = logpage; if( iLeft<nLeft && aContent[aLeft[iLeft]]==dbpage ) iLeft++; assert( iLeft>=nLeft || aContent[aLeft[iLeft]]>dbpage ); assert( iRight>=nRight || aContent[aRight[iRight]]>dbpage ); } memcpy(aList, aBuffer, sizeof(aList[0])*iOut); *pnList = iOut; } #ifdef SQLITE_DEBUG { int i; for(i=1; i<*pnList; i++){ assert( aContent[aList[i]] > aContent[aList[i-1]] ); } } #endif } /* ** Map the wal-index into memory owned by this thread, if it is not ** mapped already. Then construct a WalInterator object that can be ** used to loop over all pages in the WAL in ascending order. ** ** On success, make *pp point to the newly allocated WalInterator object ** return SQLITE_OK. Otherwise, leave *pp unchanged and return an error ** code. ** ** The calling routine should invoke walIteratorFree() to destroy the ** WalIterator object when it has finished with it. The caller must ** also unmap the wal-index. But the wal-index must not be unmapped ** prior to the WalIterator object being destroyed. */ static int walIteratorInit(Wal *pWal, WalIterator **pp){ u32 *aData; /* Content of the wal-index file */ WalIterator *p; /* Return value */ int nSegment; /* Number of segments to merge */ u32 iLast; /* Last frame in log */ int nByte; /* Number of bytes to allocate */ int i; /* Iterator variable */ int nFinal; /* Number of unindexed entries */ u8 *aTmp; /* Temp space used by merge-sort */ int rc; /* Return code of walIndexMap() */ u8 *aSpace; /* Surplus space on the end of the allocation */ /* Make sure the wal-index is mapped into local memory */ rc = walIndexMap(pWal, walMappingSize(pWal->hdr.iLastPg)); if( rc!=SQLITE_OK ){ return rc; } /* This routine only runs while holding SQLITE_SHM_CHECKPOINT. No other ** thread is able to write to shared memory while this routine is ** running (or, indeed, while the WalIterator object exists). Hence, ** we can cast off the volatile qualifacation from shared memory */ assert( pWal->lockState==SQLITE_SHM_CHECKPOINT ); aData = (u32*)pWal->pWiData; /* Allocate space for the WalIterator object */ iLast = pWal->hdr.iLastPg; nSegment = (iLast >> 8) + 1; nFinal = (iLast & 0x000000FF); nByte = sizeof(WalIterator) + (nSegment+1)*(sizeof(struct WalSegment)+256); p = (WalIterator *)sqlite3_malloc(nByte); if( !p ){ return SQLITE_NOMEM; } memset(p, 0, nByte); /* Initialize the WalIterator object. Each 256-entry segment is ** presorted in order to make iterating through all entries much ** faster. */ p->nSegment = nSegment; aSpace = (u8 *)&p->aSegment[nSegment]; aTmp = &aSpace[nSegment*256]; for(i=0; i<nSegment; i++){ int j; int nIndex = (i==nSegment-1) ? nFinal : 256; p->aSegment[i].aPgno = &aData[walIndexEntry(i*256+1)]; p->aSegment[i].aIndex = aSpace; for(j=0; j<nIndex; j++){ aSpace[j] = j; } walMergesort8(p->aSegment[i].aPgno, aTmp, aSpace, &nIndex); memset(&aSpace[nIndex], aSpace[nIndex-1], 256-nIndex); aSpace += 256; p->nFinal = nIndex; } /* Return the fully initializd WalIterator object */ *pp = p; return SQLITE_OK ; } /* ** Free an iterator allocated by walIteratorInit(). */ static void walIteratorFree(WalIterator *p){ sqlite3_free(p); } /* ** Checkpoint the contents of the log file. |
︙ | ︙ | |||
920 921 922 923 924 925 926 | } sqlite3_free(pWal); } return rc; } /* | | > > | > > > > > | < | > > | > > > | | | > > > > > > > > > > | 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 | } sqlite3_free(pWal); } return rc; } /* ** Try to read the wal-index header. Return 0 on success and 1 if ** there is a problem. ** ** The wal-index is in shared memory. Another thread or process might ** be writing the header at the same time this procedure is trying to ** read it, which might result in inconsistency. A dirty read is detected ** by verifying a checksum on the header. ** ** If and only if the read is consistent and the header is different from ** pWal->hdr, then pWal->hdr is updated to the content of the new header ** and *pChanged is set to 1. ** ** If the checksum cannot be verified return non-zero. If the header ** is read successfully and the checksum verified, return zero. */ int walIndexTryHdr(Wal *pWal, int *pChanged){ int i; volatile u32 *aWiData; u32 aCksum[2] = {1, 1}; u32 aHdr[WALINDEX_HDR_NFIELD+2]; assert( pWal->pWiData ); if( pWal->szWIndex==0 ){ /* The wal-index is of size 0 bytes. This is handled in the same way ** as an invalid header. The caller will run recovery to construct ** a valid wal-index file before accessing the database. */ return 1; } /* Read the header. The caller may or may not have an exclusive ** (WRITE, PENDING, CHECKPOINT or RECOVER) lock on the wal-index ** file, meaning it is possible that an inconsistent snapshot is read ** from the file. If this happens, return non-zero. */ aWiData = pWal->pWiData; for(i=0; i<WALINDEX_HDR_NFIELD+2; i++){ aHdr[i] = aWiData[i]; } walChecksumBytes((u8*)aHdr, sizeof(u32)*WALINDEX_HDR_NFIELD, aCksum); if( aCksum[0]!=aHdr[WALINDEX_HDR_NFIELD] || aCksum[1]!=aHdr[WALINDEX_HDR_NFIELD+1] ){ return 1; } if( memcmp(&pWal->hdr, aHdr, sizeof(WalIndexHdr)) ){ *pChanged = 1; memcpy(&pWal->hdr, aHdr, sizeof(WalIndexHdr)); } /* The header was successfully read. Return zero. */ return 0; } /* ** Read the wal-index header from the wal-index and into pWal->hdr. ** If the wal-header appears to be corrupt, try to recover the log ** before returning. ** ** Set *pChanged to 1 if the wal-index header value in pWal->hdr is ** changed by this opertion. If pWal->hdr is unchanged, set *pChanged ** to 0. ** ** This routine also maps the wal-index content into memory and assigns ** ownership of that mapping to the current thread. In some implementations, ** only one thread at a time can hold a mapping of the wal-index. Hence, ** the caller should strive to invoke walIndexUnmap() as soon as possible ** after this routine returns. ** ** If the wal-index header is successfully read, return SQLITE_OK. ** Otherwise an SQLite error code. */ static int walIndexReadHdr(Wal *pWal, int *pChanged){ int rc; /* Return code */ int lockState; /* pWal->lockState before running recovery */ |
︙ | ︙ | |||
1028 1029 1030 1031 1032 1033 1034 | } } return rc; } /* | > > > > > | > > > > | 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 | } } return rc; } /* ** Take a snapshot of the state of the WAL and wal-index for the current ** instant in time. The current thread will continue to use this snapshot. ** Other threads might containing appending to the WAL and wal-index but ** the extra content appended will be ignored by the current thread. ** ** A snapshot is like a read transaction. ** ** No other threads are allowed to run a checkpoint while this thread is ** holding the snapshot since a checkpoint would remove data out from under ** this thread. ** ** If this call obtains a new read-lock and the database contents have been ** modified since the most recent call to WalCloseSnapshot() on this Wal ** connection, then *pChanged is set to 1 before returning. Otherwise, it ** is left unmodified. This is used by the pager layer to determine whether ** or not any cached pages may be safely reused. */ |
︙ | ︙ | |||
1315 1316 1317 1318 1319 1320 1321 | int nLast = 0; /* Number of extra copies of last page */ assert( WAL_FRAME_HDRSIZE==(4 * 2 + 2*sizeof(u32)) ); assert( pList ); assert( pWal->lockState==SQLITE_SHM_WRITE ); assert( pWal->pWiData==0 ); | | | | | 1396 1397 1398 1399 1400 1401 1402 1403 1404 1405 1406 1407 1408 1409 1410 1411 1412 | int nLast = 0; /* Number of extra copies of last page */ assert( WAL_FRAME_HDRSIZE==(4 * 2 + 2*sizeof(u32)) ); assert( pList ); assert( pWal->lockState==SQLITE_SHM_WRITE ); assert( pWal->pWiData==0 ); /* If this is the first frame written into the log, write the WAL ** header to the start of the WAL file. See comments at the top of ** this source file for a description of the WAL header format. */ assert( WAL_FRAME_HDRSIZE>=WAL_HDRSIZE ); iFrame = pWal->hdr.iLastPg; if( iFrame==0 ){ sqlite3Put4byte(aFrame, nPgsz); sqlite3_randomness(8, &aFrame[4]); pWal->hdr.iCheck1 = sqlite3Get4byte(&aFrame[4]); |
︙ | ︙ | |||
1351 1352 1353 1354 1355 1356 1357 | walEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame); rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset); if( rc!=SQLITE_OK ){ return rc; } /* Write the page data */ | | | 1432 1433 1434 1435 1436 1437 1438 1439 1440 1441 1442 1443 1444 1445 1446 | walEncodeFrame(aCksum, p->pgno, nDbsize, nPgsz, p->pData, aFrame); rc = sqlite3OsWrite(pWal->pWalFd, aFrame, sizeof(aFrame), iOffset); if( rc!=SQLITE_OK ){ return rc; } /* Write the page data */ rc = sqlite3OsWrite(pWal->pWalFd, p->pData, nPgsz, iOffset+sizeof(aFrame)); if( rc!=SQLITE_OK ){ return rc; } pLast = p; } /* Sync the log file if the 'isSync' flag was specified. */ |
︙ | ︙ | |||
1388 1389 1390 1391 1392 1393 1394 | } rc = sqlite3OsSync(pWal->pWalFd, sync_flags); } assert( pWal->pWiData==0 ); /* Append data to the wal-index. It is not necessary to lock the | | | 1469 1470 1471 1472 1473 1474 1475 1476 1477 1478 1479 1480 1481 1482 1483 | } rc = sqlite3OsSync(pWal->pWalFd, sync_flags); } assert( pWal->pWiData==0 ); /* Append data to the wal-index. It is not necessary to lock the ** wal-index to do this as the SQLITE_SHM_WRITE lock held on the wal-index ** guarantees that there are no other writers, and no data that may ** be in use by existing readers is being overwritten. */ iFrame = pWal->hdr.iLastPg; for(p=pList; p && rc==SQLITE_OK; p=p->pDirty){ iFrame++; rc = walIndexAppend(pWal, iFrame, p->pgno); |
︙ | ︙ | |||
1472 1473 1474 1475 1476 1477 1478 | /* Copy data from the log to the database file. */ rc = walIndexReadHdr(pWal, &isChanged); if( rc==SQLITE_OK ){ rc = walCheckpoint(pWal, sync_flags, nBuf, zBuf); } if( isChanged ){ /* If a new wal-index header was loaded before the checkpoint was | | | 1553 1554 1555 1556 1557 1558 1559 1560 1561 1562 1563 1564 1565 1566 1567 | /* Copy data from the log to the database file. */ rc = walIndexReadHdr(pWal, &isChanged); if( rc==SQLITE_OK ){ rc = walCheckpoint(pWal, sync_flags, nBuf, zBuf); } if( isChanged ){ /* If a new wal-index header was loaded before the checkpoint was ** performed, then the pager-cache associated with pWal is now ** out of date. So zero the cached wal-index header to ensure that ** next time the pager opens a snapshot on this database it knows that ** the cache needs to be reset. */ memset(&pWal->hdr, 0, sizeof(WalIndexHdr)); } |
︙ | ︙ |