Many hyperlinks are disabled. Use anonymous login to enable hyperlinks.
Overview
Comment: | Change the way wal2 locks work to ensure a reader only ever has to lock a single slot. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | wal2 |
Files: | files | file ages | folders |
SHA3-256: | 18b2c23ac53d985ccc5798ea2d92fb75 |
User & Date: | dan 2018-12-11 17:56:23.255 |
Context
2018-12-12
| ||
19:04 | Add tests to ensure that each of the 4 wal read-locks does what it is supposed to. (check-in: 4d5779f31d user: dan tags: wal2) | |
2018-12-11
| ||
17:56 | Change the way wal2 locks work to ensure a reader only ever has to lock a single slot. (check-in: 18b2c23ac5 user: dan tags: wal2) | |
13:44 | Merge latest trunk changes into this branch. (check-in: d8dd98a39e user: dan tags: wal2) | |
Changes
Changes to src/wal.c.
︙ | ︙ | |||
358 359 360 361 362 363 364 | ** it is held, but does not prevent a checkpointer from checkpointing ** it. ** ** There is still a single WRITER and a single CHECKPOINTER lock. The ** recovery procedure still takes the same exclusive lock on the entire ** range of SQLITE_SHM_NLOCK shm-locks. This works because the read-locks ** above use four of the six read-locking slots used by legacy wal mode. | < | 358 359 360 361 362 363 364 365 366 367 368 369 370 371 | ** it is held, but does not prevent a checkpointer from checkpointing ** it. ** ** There is still a single WRITER and a single CHECKPOINTER lock. The ** recovery procedure still takes the same exclusive lock on the entire ** range of SQLITE_SHM_NLOCK shm-locks. This works because the read-locks ** above use four of the six read-locking slots used by legacy wal mode. ** ** STARTUP/RECOVERY ** ** The read and write version fields of the database header in a wal2 ** database are set to 0x03, instead of 0x02 as in legacy wal mode. ** ** The wal file format used in wal2 mode is the same as the format used |
︙ | ︙ | |||
469 470 471 472 473 474 475 | /* ** Values that may be stored in Wal.readLock in wal2 mode. ** ** In wal mode, the Wal.readLock member is set to -1 when no read-lock ** is held, or else is the index of the read-mark on which a lock is ** held. ** | < | > | > > > > | | | | 468 469 470 471 472 473 474 475 476 477 478 479 480 481 482 483 484 485 486 487 488 489 490 491 492 493 494 495 | /* ** Values that may be stored in Wal.readLock in wal2 mode. ** ** In wal mode, the Wal.readLock member is set to -1 when no read-lock ** is held, or else is the index of the read-mark on which a lock is ** held. ** ** In wal2 mode, a value of -1 still indicates that no read-lock is held. ** And a non-zero value still represents the index of the read-mark on ** which a lock is held. There are two differences: ** ** 1. wal2 mode never uses read-mark 0. ** ** 2. locks on each read-mark have a different interpretation, as ** indicated by the symbolic names below. */ #define WAL_LOCK_NONE -1 #define WAL_LOCK_PART1 1 #define WAL_LOCK_PART1_FULL2 2 #define WAL_LOCK_PART2_FULL1 3 #define WAL_LOCK_PART2 4 /* ** This constant is used in wal2 mode only. ** ** In wal2 mode, when committing a transaction, if the current wal file ** is sufficiently large and there are no conflicting locks held, the ** writer writes the new transaction into the start of the other wal |
︙ | ︙ | |||
1107 1108 1109 1110 1111 1112 1113 | if( pWal->exclusiveMode ) return; (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, n, SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE); WALTRACE(("WAL%p: release EXCLUSIVE-%s cnt=%d\n", pWal, walLockName(lockIdx), n)); } | < < < < < < < < < < < < < < < < < < < < < < < < < < < < < < | 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 | if( pWal->exclusiveMode ) return; (void)sqlite3OsShmLock(pWal->pDbFd, lockIdx, n, SQLITE_SHM_UNLOCK | SQLITE_SHM_EXCLUSIVE); WALTRACE(("WAL%p: release EXCLUSIVE-%s cnt=%d\n", pWal, walLockName(lockIdx), n)); } /* ** Compute a hash on a page number. The resulting hash value must land ** between 0 and (HASHTABLE_NSLOT-1). The walHashNext() function advances ** the hash to the next value in the event of a collision. */ static int walHash(u32 iPage){ assert( iPage>0 ); |
︙ | ︙ | |||
2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 | walIndexWriteHdr(pWal); pInfo->nBackfill = 0; pInfo->nBackfillAttempted = 0; pInfo->aReadMark[1] = 0; for(i=2; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED; assert( pInfo->aReadMark[0]==0 ); } /* ** Copy as much content as we can from the WAL back into the database file ** in response to an sqlite3_wal_checkpoint() request or the equivalent. ** ** The amount of information copies from WAL to database might be limited ** by active readers. This routine will never overwrite a database page | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 | walIndexWriteHdr(pWal); pInfo->nBackfill = 0; pInfo->nBackfillAttempted = 0; pInfo->aReadMark[1] = 0; for(i=2; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED; assert( pInfo->aReadMark[0]==0 ); } /* ** This function is used in wal2 mode. ** ** This function is called when writer pWal is just about to start ** writing out frames. Parameter iApp is the current wal file. The "other" wal ** file (wal file !iApp) has been fully checkpointed. This function returns ** SQLITE_OK if there are no readers preventing the writer from switching to ** the other wal file. Or SQLITE_BUSY if there are. */ static int wal2RestartOk(Wal *pWal, int iApp){ /* The other wal file (wal file !iApp) can be overwritten if there ** are no readers reading from it - no "full" or "partial" locks. 
** Technically speaking it is not possible for any reader to hold ** a "part" lock, as this would have prevented the file from being ** checkpointed. But checking anyway doesn't hurt. The following ** is equivalent to: ** ** if( iApp==0 ) eLock = WAL_LOCK_PART1_FULL2; ** if( iApp==1 ) eLock = WAL_LOCK_PART1; */ int eLock = 1 + (iApp==0); assert( WAL_LOCK_PART1==1 ); assert( WAL_LOCK_PART1_FULL2==2 ); assert( WAL_LOCK_PART2_FULL1==3 ); assert( WAL_LOCK_PART2==4 ); assert( iApp!=0 || eLock==WAL_LOCK_PART1_FULL2 ); assert( iApp!=1 || eLock==WAL_LOCK_PART1 ); return walLockExclusive(pWal, WAL_READ_LOCK(eLock), 3); } static void wal2RestartFinished(Wal *pWal, int iApp){ walUnlockExclusive(pWal, WAL_READ_LOCK(1 + (iApp==0)), 3); } /* ** This function is used in wal2 mode. ** ** This function is called when a checkpointer wishes to checkpoint wal ** file iCkpt. It takes the required lock and, if successful, returns ** SQLITE_OK. Otherwise, an SQLite error code (e.g. SQLITE_BUSY). If this ** function returns SQLITE_OK, it is the responsibility of the caller ** to invoke wal2CheckpointFinished() to release the lock. */ static int wal2CheckpointOk(Wal *pWal, int iCkpt){ int eLock = 1 + (iCkpt*2); assert( WAL_LOCK_PART1==1 ); assert( WAL_LOCK_PART1_FULL2==2 ); assert( WAL_LOCK_PART2_FULL1==3 ); assert( WAL_LOCK_PART2==4 ); assert( iCkpt!=0 || eLock==WAL_LOCK_PART1 ); assert( iCkpt!=1 || eLock==WAL_LOCK_PART2_FULL1 ); return walLockExclusive(pWal, WAL_READ_LOCK(eLock), 2); } static void wal2CheckpointFinished(Wal *pWal, int iCkpt){ walUnlockExclusive(pWal, WAL_READ_LOCK(1 + (iCkpt*2)), 2); } /* ** Copy as much content as we can from the WAL back into the database file ** in response to an sqlite3_wal_checkpoint() request or the equivalent. ** ** The amount of information copies from WAL to database might be limited ** by active readers. This routine will never overwrite a database page |
︙ | ︙ | |||
2314 2315 2316 2317 2318 2319 2320 | sqlite3_file *pWalFd = pWal->apWalFd[iCkpt]; mxPage = pWal->hdr.nPage; /* If this is a wal2 system, check for a reader holding a lock ** preventing this checkpoint operation. If one is found, return ** early. */ if( bWal2 ){ | | | 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 | sqlite3_file *pWalFd = pWal->apWalFd[iCkpt]; mxPage = pWal->hdr.nPage; /* If this is a wal2 system, check for a reader holding a lock ** preventing this checkpoint operation. If one is found, return ** early. */ if( bWal2 ){ rc = wal2CheckpointOk(pWal, iCkpt); if( rc!=SQLITE_OK ) return rc; } /* EVIDENCE-OF: R-62920-47450 The busy-handler callback is never invoked ** in the SQLITE_CHECKPOINT_PASSIVE mode. */ assert( eMode!=SQLITE_CHECKPOINT_PASSIVE || xBusy==0 ); |
︙ | ︙ | |||
2363 2364 2365 2366 2367 2368 2369 | /* Allocate the iterator */ if( bWal2 || pInfo->nBackfill<mxSafeFrame ){ assert( bWal2==0 || pInfo->nBackfill==0 ); rc = walIteratorInit(pWal, iCkpt, pInfo->nBackfill, &pIter); assert( rc==SQLITE_OK || pIter==0 ); } | | | | | 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 | /* Allocate the iterator */ if( bWal2 || pInfo->nBackfill<mxSafeFrame ){ assert( bWal2==0 || pInfo->nBackfill==0 ); rc = walIteratorInit(pWal, iCkpt, pInfo->nBackfill, &pIter); assert( rc==SQLITE_OK || pIter==0 ); } if( pIter && (bWal2 || (rc = walBusyLock(pWal, xBusy, pBusyArg,WAL_READ_LOCK(0),1))==SQLITE_OK )){ u32 nBackfill = pInfo->nBackfill; assert( bWal2==0 || nBackfill==0 ); pInfo->nBackfillAttempted = mxSafeFrame; /* Sync the wal file being checkpointed to disk */ rc = sqlite3OsSync(pWalFd, CKPT_SYNC_FLAGS(sync_flags)); |
︙ | ︙ | |||
2430 2431 2432 2433 2434 2435 2436 | } } if( rc==SQLITE_OK ){ pInfo->nBackfill = bWal2 ? 1 : mxSafeFrame; } /* Release the reader lock held while backfilling */ | > | > > | 2465 2466 2467 2468 2469 2470 2471 2472 2473 2474 2475 2476 2477 2478 2479 2480 2481 2482 2483 2484 2485 2486 2487 2488 2489 | } } if( rc==SQLITE_OK ){ pInfo->nBackfill = bWal2 ? 1 : mxSafeFrame; } /* Release the reader lock held while backfilling */ if( bWal2==0 ){ walUnlockExclusive(pWal, WAL_READ_LOCK(0), 1); } } if( rc==SQLITE_BUSY ){ /* Reset the return code so as not to report a checkpoint failure ** just because there are active readers. */ rc = SQLITE_OK; } if( bWal2 ) wal2CheckpointFinished(pWal, iCkpt); } /* If this is an SQLITE_CHECKPOINT_RESTART or TRUNCATE operation, and the ** entire wal file has been copied into the database file, then block ** until all readers have finished using the wal file. This ensures that ** the next process to write to the database restarts the wal file. */ |
︙ | ︙ | |||
3056 3057 3058 3059 3060 3061 3062 | } } assert( pWal->nWiData>0 ); assert( pWal->apWiData[0]!=0 ); pInfo = walCkptInfo(pWal); if( isWalMode2(pWal) ){ | > > > | | > | > > > > > > | | < | | 3094 3095 3096 3097 3098 3099 3100 3101 3102 3103 3104 3105 3106 3107 3108 3109 3110 3111 3112 3113 3114 3115 3116 3117 3118 3119 3120 3121 3122 3123 3124 3125 3126 3127 3128 | } } assert( pWal->nWiData>0 ); assert( pWal->apWiData[0]!=0 ); pInfo = walCkptInfo(pWal); if( isWalMode2(pWal) ){ /* This connection needs a "part" lock on the current wal file and, ** unless pInfo->nBackfill is set to indicate that it has already been ** checkpointed, a "full" lock on the other wal file. */ int iWal = walidxGetFile(&pWal->hdr); int nBackfill = pInfo->nBackfill || walidxGetMxFrame(&pWal->hdr, !iWal)==0; int eLock = 1 + (iWal*2) + (nBackfill==iWal); assert( nBackfill==0 || nBackfill==1 ); assert( iWal==0 || iWal==1 ); assert( iWal!=0 || nBackfill!=1 || eLock==WAL_LOCK_PART1 ); assert( iWal!=0 || nBackfill!=0 || eLock==WAL_LOCK_PART1_FULL2 ); assert( iWal!=1 || nBackfill!=1 || eLock==WAL_LOCK_PART2 ); assert( iWal!=1 || nBackfill!=0 || eLock==WAL_LOCK_PART2_FULL1 ); rc = walLockShared(pWal, WAL_READ_LOCK(eLock)); if( rc!=SQLITE_OK ){ return rc; } walShmBarrier(pWal); if( memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){ walUnlockShared(pWal, WAL_READ_LOCK(eLock)); return WAL_RETRY; }else{ pWal->readLock = eLock; } assert( pWal->minFrame==0 && walFramePage(pWal->minFrame)==0 ); }else{ u32 mxReadMark; /* Largest aReadMark[] value */ |
︙ | ︙ | |||
3399 3400 3401 3402 3403 3404 3405 | /* ** Finish with a read transaction. All this does is release the ** read-lock. */ void sqlite3WalEndReadTransaction(Wal *pWal){ sqlite3WalEndWriteTransaction(pWal); if( pWal->readLock!=WAL_LOCK_NONE ){ | < < < | < | 3446 3447 3448 3449 3450 3451 3452 3453 3454 3455 3456 3457 3458 3459 3460 | /* ** Finish with a read transaction. All this does is release the ** read-lock. */ void sqlite3WalEndReadTransaction(Wal *pWal){ sqlite3WalEndWriteTransaction(pWal); if( pWal->readLock!=WAL_LOCK_NONE ){ walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); pWal->readLock = WAL_LOCK_NONE; } } /* Search hash table iHash for an entry matching page number ** pgno. Each call to this function searches a single hash table ** (each hash table indexes up to HASHTABLE_NPAGE frames). |
︙ | ︙ | |||
3790 3791 3792 3793 3794 3795 3796 | pWal->hdr.aFrameCksum[1] = aWalData[2]; walCleanupHash(pWal); } return rc; } | < < < < < < < < < < < < < < < < < < < < < < < < < < < | 3833 3834 3835 3836 3837 3838 3839 3840 3841 3842 3843 3844 3845 3846 | pWal->hdr.aFrameCksum[1] = aWalData[2]; walCleanupHash(pWal); } return rc; } /* ** This function is called just before writing a set of frames to the log ** file (see sqlite3WalFrames()). It checks to see if, instead of appending ** to the current log file, it is possible and desirable to switch to the ** other log file and write the new transaction to the start of it. ** If so, the wal-index header is updated accordingly - both in heap memory ** and in the *-shm file. |
︙ | ︙ | |||
3844 3845 3846 3847 3848 3849 3850 | / (pWal->szPage+WAL_FRAME_HDRSIZE); nWalSize = MAX(nWalSize, 1); } if( walidxGetMxFrame(&pWal->hdr, iApp)>=nWalSize ){ volatile WalCkptInfo *pInfo = walCkptInfo(pWal); if( walidxGetMxFrame(&pWal->hdr, !iApp)==0 || pInfo->nBackfill ){ | | | | | > | | | | 3860 3861 3862 3863 3864 3865 3866 3867 3868 3869 3870 3871 3872 3873 3874 3875 3876 3877 3878 3879 3880 3881 3882 3883 3884 3885 3886 3887 | / (pWal->szPage+WAL_FRAME_HDRSIZE); nWalSize = MAX(nWalSize, 1); } if( walidxGetMxFrame(&pWal->hdr, iApp)>=nWalSize ){ volatile WalCkptInfo *pInfo = walCkptInfo(pWal); if( walidxGetMxFrame(&pWal->hdr, !iApp)==0 || pInfo->nBackfill ){ rc = wal2RestartOk(pWal, iApp); if( rc==SQLITE_OK ){ int iNew = !iApp; pWal->nCkpt++; walidxSetFile(&pWal->hdr, iNew); walidxSetMxFrame(&pWal->hdr, iNew, 0); sqlite3Put4byte((u8*)&pWal->hdr.aSalt[0], pWal->hdr.aFrameCksum[0]); sqlite3Put4byte((u8*)&pWal->hdr.aSalt[1], pWal->hdr.aFrameCksum[1]); walIndexWriteHdr(pWal); pInfo->nBackfill = 0; wal2RestartFinished(pWal, iApp); walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); pWal->readLock = iNew ? WAL_LOCK_PART2_FULL1 : WAL_LOCK_PART1_FULL2; rc = walLockShared(pWal, WAL_READ_LOCK(pWal->readLock)); }else if( rc==SQLITE_BUSY ){ rc = SQLITE_OK; } } } }else if( pWal->readLock==0 ){ int cnt; |
︙ | ︙ | |||
4486 4487 4488 4489 4490 4491 4492 | */ assert( pWal->readLock!=WAL_LOCK_NONE || pWal->lockError ); assert( pWal->readLock!=WAL_LOCK_NONE || (op<=0 && pWal->exclusiveMode==0) ); if( op==0 ){ if( pWal->exclusiveMode ){ pWal->exclusiveMode = WAL_NORMAL_MODE; | < < < | < < < < | < | 4503 4504 4505 4506 4507 4508 4509 4510 4511 4512 4513 4514 4515 4516 4517 4518 4519 4520 4521 4522 4523 4524 4525 4526 4527 4528 4529 | */ assert( pWal->readLock!=WAL_LOCK_NONE || pWal->lockError ); assert( pWal->readLock!=WAL_LOCK_NONE || (op<=0 && pWal->exclusiveMode==0) ); if( op==0 ){ if( pWal->exclusiveMode ){ pWal->exclusiveMode = WAL_NORMAL_MODE; rc = walLockShared(pWal, WAL_READ_LOCK(pWal->readLock)); if( rc!=SQLITE_OK ){ pWal->exclusiveMode = WAL_EXCLUSIVE_MODE; } rc = pWal->exclusiveMode==WAL_NORMAL_MODE; }else{ /* Already in locking_mode=NORMAL */ rc = 0; } }else if( op>0 ){ assert( pWal->exclusiveMode==WAL_NORMAL_MODE ); assert( pWal->readLock>=0 ); walUnlockShared(pWal, WAL_READ_LOCK(pWal->readLock)); pWal->exclusiveMode = WAL_EXCLUSIVE_MODE; rc = 1; }else{ rc = pWal->exclusiveMode==WAL_NORMAL_MODE; } return rc; } |
︙ | ︙ |
Changes to test/wal2simple.test.
︙ | ︙ | |||
239 240 241 242 243 244 245 | do_test 6.1 { for {set i 0} {$i < 10} {incr i} { execsql "CREATE TABLE t$i (x);" } } {} | < < | 239 240 241 242 243 244 245 246 247 248 249 250 251 252 | do_test 6.1 { for {set i 0} {$i < 10} {incr i} { execsql "CREATE TABLE t$i (x);" } } {} do_test 6.2.1 { foreach f [glob -nocomplain test.db2*] { forcedelete $f } forcecopy test.db-wal2 test.db2-wal2 sqlite3 db2 test.db2 db2 eval { SELECT * FROM sqlite_master } } {} do_test 6.2.2 { |
︙ | ︙ |