Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Add the nBackfillAttempted field in formerly unused space in WalCkptInfo and use that field to close the race condition on opening a snapshot. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | snapshot-get |
Files: | files | file ages | folders |
SHA1: |
cb68e9d0738fc7db7316947b4d2aab91 |
User & Date: | drh 2015-12-10 02:15:03.333 |
Context
2015-12-10
| ||
03:16 | Fix spacing typo in comment. No changes to code. (check-in: 3a18526fc2 user: mistachkin tags: snapshot-get) | |
02:15 | Add the nBackfillAttempted field in formerly unused space in WalCkptInfo and use that field to close the race condition on opening a snapshot. (check-in: cb68e9d073 user: drh tags: snapshot-get) | |
2015-12-09
| ||
20:05 | Update sqlite3_snapshot_open() to reduce the chances of reading a corrupt snapshot created by a checkpointer process exiting unexpectedly. (check-in: 7315f7cbf4 user: dan tags: snapshot-get) | |
Changes
Changes to src/wal.c.
︙ | ︙ | |||
268 269 270 271 272 273 274 | ** returns SQLITE_CANTOPEN. */ #define WAL_MAX_VERSION 3007000 #define WALINDEX_MAX_VERSION 3007000 /* ** Indices of various locking bytes. WAL_NREADER is the number | | > | > > > | 268 269 270 271 272 273 274 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 | ** returns SQLITE_CANTOPEN. */ #define WAL_MAX_VERSION 3007000 #define WALINDEX_MAX_VERSION 3007000 /* ** Indices of various locking bytes. WAL_NREADER is the number ** of available reader locks and should be at least 3. The default ** is SQLITE_SHM_NLOCK==8 and WAL_NREADER==5. */ #define WAL_WRITE_LOCK 0 #define WAL_ALL_BUT_WRITE 1 #define WAL_CKPT_LOCK 1 #define WAL_RECOVER_LOCK 2 #define WAL_READ_LOCK(I) (3+(I)) #define WAL_NREADER (SQLITE_SHM_NLOCK-3) /* Object declarations */ typedef struct WalIndexHdr WalIndexHdr; typedef struct WalIterator WalIterator; typedef struct WalCkptInfo WalCkptInfo; /* ** The following object holds a copy of the wal-index header content. ** ** The actual header in the wal-index consists of two copies of this ** object followed by one instance of the WalCkptInfo object. ** For all versions of SQLite through 3.10.0 and probably beyond, ** the locking bytes (WalCkptInfo.aLock) start at offset 120 and ** the total header size is 136 bytes. ** ** The szPage value can be any power of 2 between 512 and 32768, inclusive. ** Or it can be 1 to represent a 65536-byte page. The latter case was ** added in 3.7.1 when support for 64K pages was added. */ struct WalIndexHdr { u32 iVersion; /* Wal-index version */ |
︙ | ︙ | |||
320 321 322 323 324 325 326 327 328 329 330 331 332 333 | ** nBackfill is the number of frames in the WAL that have been written ** back into the database. (We call the act of moving content from WAL to ** database "backfilling".) The nBackfill number is never greater than ** WalIndexHdr.mxFrame. nBackfill can only be increased by threads ** holding the WAL_CKPT_LOCK lock (which includes a recovery thread). ** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from ** mxFrame back to zero when the WAL is reset. ** ** There is one entry in aReadMark[] for each reader lock. If a reader ** holds read-lock K, then the value in aReadMark[K] is no greater than ** the mxFrame for that reader. The value READMARK_NOT_USED (0xffffffff) ** for any aReadMark[] means that entry is unused. aReadMark[0] is ** a special case; its value is never used and it exists as a place-holder ** to avoid having to offset aReadMark[] indexs by one. Readers holding | > > > > > > > > > > | 324 325 326 327 328 329 330 331 332 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 | ** nBackfill is the number of frames in the WAL that have been written ** back into the database. (We call the act of moving content from WAL to ** database "backfilling".) The nBackfill number is never greater than ** WalIndexHdr.mxFrame. nBackfill can only be increased by threads ** holding the WAL_CKPT_LOCK lock (which includes a recovery thread). ** However, a WAL_WRITE_LOCK thread can move the value of nBackfill from ** mxFrame back to zero when the WAL is reset. ** ** nBackfillAttempted is the largest value of nBackfill that a checkpoint ** has attempted to achieve. Normally nBackfill==nBackfillAtempted, however ** the nBackfillAttempted is set before any backfilling is done and the ** nBackfill is only set afte rall backfilling completes. So if a checkpoint ** crashes, nBackfillAttempted might be larger than nBackfill. The ** WalIndexHdr.mxFrame must never be less than nBackfillAttempted. ** ** The aLock[] field is a set of bytes used for locking. These bytes should ** never be read or written. ** ** There is one entry in aReadMark[] for each reader lock. If a reader ** holds read-lock K, then the value in aReadMark[K] is no greater than ** the mxFrame for that reader. The value READMARK_NOT_USED (0xffffffff) ** for any aReadMark[] means that entry is unused. aReadMark[0] is ** a special case; its value is never used and it exists as a place-holder ** to avoid having to offset aReadMark[] indexs by one. Readers holding |
︙ | ︙ | |||
360 361 362 363 364 365 366 367 368 369 370 371 372 373 374 375 | ** ** We assume that 32-bit loads are atomic and so no locks are needed in ** order to read from any aReadMark[] entries. */ struct WalCkptInfo { u32 nBackfill; /* Number of WAL frames backfilled into DB */ u32 aReadMark[WAL_NREADER]; /* Reader marks */ }; #define READMARK_NOT_USED 0xffffffff /* A block of WALINDEX_LOCK_RESERVED bytes beginning at ** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems ** only support mandatory file-locks, we do not read or write data ** from the region of the file on which locks are applied. */ | > > > < | | | 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 393 394 395 396 397 398 399 400 401 | ** ** We assume that 32-bit loads are atomic and so no locks are needed in ** order to read from any aReadMark[] entries. */ struct WalCkptInfo { u32 nBackfill; /* Number of WAL frames backfilled into DB */ u32 aReadMark[WAL_NREADER]; /* Reader marks */ u8 aLock[SQLITE_SHM_NLOCK]; /* Reserved space for locks */ u32 nBackfillAttempted; /* WAL frames perhaps written, or maybe not */ u32 notUsed0; /* Available for future enhancements */ }; #define READMARK_NOT_USED 0xffffffff /* A block of WALINDEX_LOCK_RESERVED bytes beginning at ** WALINDEX_LOCK_OFFSET is reserved for locks. Since some systems ** only support mandatory file-locks, we do not read or write data ** from the region of the file on which locks are applied. */ #define WALINDEX_LOCK_OFFSET (sizeof(WalIndexHdr)*2+offsetof(WalCkptInfo,aLock)) #define WALINDEX_HDR_SIZE (sizeof(WalIndexHdr)*2+sizeof(WalCkptInfo)) /* Size of header before each frame in wal */ #define WAL_FRAME_HDRSIZE 24 /* Size of write ahead log header, including checksum. */ /* #define WAL_HDRSIZE 24 */ #define WAL_HDRSIZE 32 |
︙ | ︙ | |||
431 432 433 434 435 436 437 | u32 minFrame; /* Ignore wal frames before this one */ const char *zWalName; /* Name of WAL file */ u32 nCkpt; /* Checkpoint sequence counter in the wal-header */ #ifdef SQLITE_DEBUG u8 lockError; /* True if a locking error has occurred */ #endif #ifdef SQLITE_ENABLE_SNAPSHOT | | | 447 448 449 450 451 452 453 454 455 456 457 458 459 460 461 | u32 minFrame; /* Ignore wal frames before this one */ const char *zWalName; /* Name of WAL file */ u32 nCkpt; /* Checkpoint sequence counter in the wal-header */ #ifdef SQLITE_DEBUG u8 lockError; /* True if a locking error has occurred */ #endif #ifdef SQLITE_ENABLE_SNAPSHOT WalIndexHdr *pSnapshot; /* Start transaction here if not NULL */ #endif }; /* ** Candidate values for Wal.exclusiveMode. */ #define WAL_NORMAL_MODE 0 |
︙ | ︙ | |||
1197 1198 1199 1200 1201 1202 1203 1204 1205 1206 1207 1208 1209 1210 | /* Reset the checkpoint-header. This is safe because this thread is ** currently holding locks that exclude all other readers, writers and ** checkpointers. */ pInfo = walCkptInfo(pWal); pInfo->nBackfill = 0; pInfo->aReadMark[0] = 0; for(i=1; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED; if( pWal->hdr.mxFrame ) pInfo->aReadMark[1] = pWal->hdr.mxFrame; /* If more than one frame was recovered from the log file, report an ** event via sqlite3_log(). This is to help with identifying performance ** problems caused by applications routinely shutting down without | > | 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 | /* Reset the checkpoint-header. This is safe because this thread is ** currently holding locks that exclude all other readers, writers and ** checkpointers. */ pInfo = walCkptInfo(pWal); pInfo->nBackfill = 0; pInfo->nBackfillAttempted = 0; pInfo->aReadMark[0] = 0; for(i=1; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED; if( pWal->hdr.mxFrame ) pInfo->aReadMark[1] = pWal->hdr.mxFrame; /* If more than one frame was recovered from the log file, report an ** event via sqlite3_log(). This is to help with identifying performance ** problems caused by applications routinely shutting down without |
︙ | ︙ | |||
1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 | assert( zWalName && zWalName[0] ); assert( pDbFd ); /* In the amalgamation, the os_unix.c and os_win.c source files come before ** this source file. Verify that the #defines of the locking byte offsets ** in os_unix.c and os_win.c agree with the WALINDEX_LOCK_OFFSET value. */ #ifdef WIN_SHM_BASE assert( WIN_SHM_BASE==WALINDEX_LOCK_OFFSET ); #endif #ifdef UNIX_SHM_BASE assert( UNIX_SHM_BASE==WALINDEX_LOCK_OFFSET ); #endif | > > > > | 1285 1286 1287 1288 1289 1290 1291 1292 1293 1294 1295 1296 1297 1298 1299 1300 1301 1302 1303 | assert( zWalName && zWalName[0] ); assert( pDbFd ); /* In the amalgamation, the os_unix.c and os_win.c source files come before ** this source file. Verify that the #defines of the locking byte offsets ** in os_unix.c and os_win.c agree with the WALINDEX_LOCK_OFFSET value. ** For that matter, if the lock offset ever changes from its initial design ** value of 120, we need to know that so there is an assert() to check it. */ assert( 120==WALINDEX_LOCK_OFFSET ); assert( 136==WALINDEX_HDR_SIZE ); #ifdef WIN_SHM_BASE assert( WIN_SHM_BASE==WALINDEX_LOCK_OFFSET ); #endif #ifdef UNIX_SHM_BASE assert( UNIX_SHM_BASE==WALINDEX_LOCK_OFFSET ); #endif |
︙ | ︙ | |||
1654 1655 1656 1657 1658 1659 1660 1661 1662 1663 1664 1665 1666 1667 | u32 *aSalt = pWal->hdr.aSalt; /* Big-endian salt values */ pWal->nCkpt++; pWal->hdr.mxFrame = 0; sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0])); memcpy(&pWal->hdr.aSalt[1], &salt1, 4); walIndexWriteHdr(pWal); pInfo->nBackfill = 0; pInfo->aReadMark[1] = 0; for(i=2; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED; assert( pInfo->aReadMark[0]==0 ); } /* ** Copy as much content as we can from the WAL back into the database file | > | 1675 1676 1677 1678 1679 1680 1681 1682 1683 1684 1685 1686 1687 1688 1689 | u32 *aSalt = pWal->hdr.aSalt; /* Big-endian salt values */ pWal->nCkpt++; pWal->hdr.mxFrame = 0; sqlite3Put4byte((u8*)&aSalt[0], 1 + sqlite3Get4byte((u8*)&aSalt[0])); memcpy(&pWal->hdr.aSalt[1], &salt1, 4); walIndexWriteHdr(pWal); pInfo->nBackfill = 0; pInfo->nBackfillAttempted = 0; pInfo->aReadMark[1] = 0; for(i=2; i<WAL_NREADER; i++) pInfo->aReadMark[i] = READMARK_NOT_USED; assert( pInfo->aReadMark[0]==0 ); } /* ** Copy as much content as we can from the WAL back into the database file |
︙ | ︙ | |||
1731 1732 1733 1734 1735 1736 1737 1738 1739 1740 1741 1742 1743 1744 | /* Compute in mxSafeFrame the index of the last frame of the WAL that is ** safe to write into the database. Frames beyond mxSafeFrame might ** overwrite database pages that are in use by active readers and thus ** cannot be backfilled from the WAL. */ mxSafeFrame = pWal->hdr.mxFrame; mxPage = pWal->hdr.nPage; for(i=1; i<WAL_NREADER; i++){ /* Thread-sanitizer reports that the following is an unsafe read, ** as some other thread may be in the process of updating the value ** of the aReadMark[] slot. The assumption here is that if that is ** happening, the other client may only be increasing the value, ** not decreasing it. So assuming either that either the "old" or | > | 1753 1754 1755 1756 1757 1758 1759 1760 1761 1762 1763 1764 1765 1766 1767 | /* Compute in mxSafeFrame the index of the last frame of the WAL that is ** safe to write into the database. Frames beyond mxSafeFrame might ** overwrite database pages that are in use by active readers and thus ** cannot be backfilled from the WAL. */ mxSafeFrame = pWal->hdr.mxFrame; pInfo->nBackfillAttempted = mxSafeFrame; mxPage = pWal->hdr.nPage; for(i=1; i<WAL_NREADER; i++){ /* Thread-sanitizer reports that the following is an unsafe read, ** as some other thread may be in the process of updating the value ** of the aReadMark[] slot. The assumption here is that if that is ** happening, the other client may only be increasing the value, ** not decreasing it. So assuming either that either the "old" or |
︙ | ︙ | |||
2267 2268 2269 2270 2271 2272 2273 | u32 thisMark = pInfo->aReadMark[i]; if( mxReadMark<=thisMark && thisMark<=mxFrame ){ assert( thisMark!=READMARK_NOT_USED ); mxReadMark = thisMark; mxI = i; } } | < < | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | < | 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 2309 2310 2311 2312 2313 2314 2315 2316 2317 2318 2319 2320 2321 2322 2323 2324 2325 2326 2327 2328 2329 2330 2331 2332 2333 2334 2335 2336 2337 2338 2339 2340 2341 2342 2343 2344 2345 2346 2347 2348 2349 2350 2351 2352 2353 2354 2355 2356 2357 2358 2359 2360 2361 2362 2363 2364 2365 2366 2367 2368 2369 2370 2371 2372 2373 2374 2375 2376 2377 | u32 thisMark = pInfo->aReadMark[i]; if( mxReadMark<=thisMark && thisMark<=mxFrame ){ assert( thisMark!=READMARK_NOT_USED ); mxReadMark = thisMark; mxI = i; } } if( (pWal->readOnly & WAL_SHM_RDONLY)==0 && (mxReadMark<mxFrame || mxI==0) #ifdef SQLITE_ENABLE_SNAPSHOT && pWal->pSnapshot==0 #endif ){ for(i=1; i<WAL_NREADER; i++){ rc = walLockExclusive(pWal, WAL_READ_LOCK(i), 1); if( rc==SQLITE_OK ){ mxReadMark = pInfo->aReadMark[i] = mxFrame; mxI = i; walUnlockExclusive(pWal, WAL_READ_LOCK(i), 1); break; }else if( rc!=SQLITE_BUSY ){ return rc; } } } if( mxI==0 ){ #ifdef SQLITE_ENABLE_SNAPSHOT if( pWal->pSnapshot ) return SQLITE_BUSY_SNAPSHOT; #endif assert( rc==SQLITE_BUSY || (pWal->readOnly & WAL_SHM_RDONLY)!=0 ); return rc==SQLITE_BUSY ? WAL_RETRY : SQLITE_READONLY_CANTLOCK; } rc = walLockShared(pWal, WAL_READ_LOCK(mxI)); if( rc ){ return rc==SQLITE_BUSY ? WAL_RETRY : rc; } /* Now that the read-lock has been obtained, check that neither the ** value in the aReadMark[] array or the contents of the wal-index ** header have changed. ** ** It is necessary to check that the wal-index header did not change ** between the time it was read and when the shared-lock was obtained ** on WAL_READ_LOCK(mxI) was obtained to account for the possibility ** that the log file may have been wrapped by a writer, or that frames ** that occur later in the log than pWal->hdr.mxFrame may have been ** copied into the database by a checkpointer. If either of these things ** happened, then reading the database with the current value of ** pWal->hdr.mxFrame risks reading a corrupted snapshot. So, retry ** instead. ** ** Before checking that the live wal-index header has not changed ** since it was read, set Wal.minFrame to the first frame in the wal ** file that has not yet been checkpointed. This client will not need ** to read any frames earlier than minFrame from the wal file - they ** can be safely read directly from the database file. ** ** Because a ShmBarrier() call is made between taking the copy of ** nBackfill and checking that the wal-header in shared-memory still ** matches the one cached in pWal->hdr, it is guaranteed that the ** checkpointer that set nBackfill was not working with a wal-index ** header newer than that cached in pWal->hdr. If it were, that could ** cause a problem. The checkpointer could omit to checkpoint ** a version of page X that lies before pWal->minFrame (call that version ** A) on the basis that there is a newer version (version B) of the same ** page later in the wal file. But if version B happens to like past ** frame pWal->hdr.mxFrame - then the client would incorrectly assume ** that it can read version A from the database file. However, since ** we can guarantee that the checkpointer that set nBackfill could not ** see any pages past pWal->hdr.mxFrame, this problem does not come up. */ pWal->minFrame = pInfo->nBackfill+1; walShmBarrier(pWal); if( pInfo->aReadMark[mxI]!=mxReadMark || memcmp((void *)walIndexHdr(pWal), &pWal->hdr, sizeof(WalIndexHdr)) ){ walUnlockShared(pWal, WAL_READ_LOCK(mxI)); return WAL_RETRY; }else{ assert( mxReadMark<=pWal->hdr.mxFrame ); pWal->readLock = (i16)mxI; } return rc; } /* ** Begin a read transaction on the database. ** |
︙ | ︙ | |||
2369 2370 2371 2372 2373 2374 2375 | int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){ int rc; /* Return code */ int cnt = 0; /* Number of TryBeginRead attempts */ #ifdef SQLITE_ENABLE_SNAPSHOT int bChanged = 0; WalIndexHdr *pSnapshot = pWal->pSnapshot; | | | | < < | > < < | < < < < < | < < < < < | | | < < < < < | | | 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 2402 2403 2404 2405 2406 2407 2408 2409 2410 2411 2412 2413 2414 2415 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 2433 2434 2435 2436 2437 2438 2439 2440 2441 | int sqlite3WalBeginReadTransaction(Wal *pWal, int *pChanged){ int rc; /* Return code */ int cnt = 0; /* Number of TryBeginRead attempts */ #ifdef SQLITE_ENABLE_SNAPSHOT int bChanged = 0; WalIndexHdr *pSnapshot = pWal->pSnapshot; if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){ bChanged = 1; } #endif do{ rc = walTryBeginRead(pWal, pChanged, 0, ++cnt); }while( rc==WAL_RETRY ); testcase( (rc&0xff)==SQLITE_BUSY ); testcase( (rc&0xff)==SQLITE_IOERR ); testcase( rc==SQLITE_PROTOCOL ); testcase( rc==SQLITE_OK ); #ifdef SQLITE_ENABLE_SNAPSHOT if( rc==SQLITE_OK ){ if( pSnapshot && memcmp(pSnapshot, &pWal->hdr, sizeof(WalIndexHdr))!=0 ){ /* At this point the client has a lock on an aReadMark[] slot holding ** a value equal to or smaller than pSnapshot->mxFrame. Verify that ** pSnapshot is still valid before continuing. Reasons why pSnapshot ** might no longer be valid: ** ** (1) The WAL file has been reset since the snapshot was taken. ** In this case, the salt will have changed. ** ** (2) A checkpoint as been attempted that wrote frames past ** pSnapshot->mxFrame into the database file. Note that the ** checkpoint need not have completed for this to cause problems. */ volatile WalCkptInfo *pInfo = walCkptInfo(pWal); assert( pWal->readLock>0 ); assert( pInfo->aReadMark[pWal->readLock]<=pSnapshot->mxFrame ); /* Check that the wal file has not been wrapped. Assuming it has not, ** overwrite pWal->hdr with *pSnapshot and set *pChanged as appropriate ** for opening the snapshot. Or, if the wal file has been wrapped ** since pSnapshot was written, return SQLITE_BUSY_SNAPSHOT. */ if( memcmp(pSnapshot->aSalt, pWal->hdr.aSalt, sizeof(pWal->hdr.aSalt))==0 && pSnapshot->mxFrame>=pInfo->nBackfillAttempted ){ memcpy(&pWal->hdr, pSnapshot, sizeof(WalIndexHdr)); *pChanged = bChanged; }else{ rc = SQLITE_BUSY_SNAPSHOT; } |
︙ | ︙ |