Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | In cases where a readonly_shm client cannot take the DMS lock on the *-shm file, have it parse the wal file and create a wal-index to access it in heap memory. |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | readonly-wal-recovery |
Files: | files | file ages | folders |
SHA3-256: |
18b268433d739486eac1b04947bd4186 |
User & Date: | dan 2017-11-04 18:10:03.528 |
References
2017-11-13
| ||
05:51 | Remove some branches in walTryBeginRead() that were added by check-in [ce5d13c2de] but became unreachable with the addition of logic in check-in [18b26843] that enabled read-only clients to parse the WAL file into a heap-memory WAL-index, thus guaranteeing that the WAL-index header is always available. (check-in: 9c6b38b9a9 user: drh tags: readonly-wal-recovery) | |
Context
2017-11-04
| ||
21:06 | Add further tests for the code added on this branch. (check-in: a6716fcde3 user: dan tags: readonly-wal-recovery) | |
18:10 | In cases where a readonly_shm client cannot take the DMS lock on the *-shm file, have it parse the wal file and create a wal-index to access it in heap memory. (check-in: 18b268433d user: dan tags: readonly-wal-recovery) | |
2017-11-02
| ||
18:57 | Fix test cases in wal2.test broken by the locking change in the previous commit. (check-in: f569c35172 user: dan tags: readonly-wal-recovery) | |
Changes
Changes to src/wal.c.
︙ | ︙ | |||
451 452 453 454 455 456 457 458 459 460 461 462 463 464 | u8 exclusiveMode; /* Non-zero if connection is in exclusive mode */ u8 writeLock; /* True if in a write transaction */ u8 ckptLock; /* True if holding a checkpoint lock */ u8 readOnly; /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */ u8 truncateOnCommit; /* True to truncate WAL file on commit */ u8 syncHeader; /* Fsync the WAL header if true */ u8 padToSectorBoundary; /* Pad transactions out to the next sector */ WalIndexHdr hdr; /* Wal-index header for current transaction */ u32 minFrame; /* Ignore wal frames before this one */ u32 iReCksum; /* On commit, recalculate checksums from here */ const char *zWalName; /* Name of WAL file */ u32 nCkpt; /* Checkpoint sequence counter in the wal-header */ #ifdef SQLITE_DEBUG u8 lockError; /* True if a locking error has occurred */ | > | 451 452 453 454 455 456 457 458 459 460 461 462 463 464 465 | u8 exclusiveMode; /* Non-zero if connection is in exclusive mode */ u8 writeLock; /* True if in a write transaction */ u8 ckptLock; /* True if holding a checkpoint lock */ u8 readOnly; /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */ u8 truncateOnCommit; /* True to truncate WAL file on commit */ u8 syncHeader; /* Fsync the WAL header if true */ u8 padToSectorBoundary; /* Pad transactions out to the next sector */ u8 bUnlocked; WalIndexHdr hdr; /* Wal-index header for current transaction */ u32 minFrame; /* Ignore wal frames before this one */ u32 iReCksum; /* On commit, recalculate checksums from here */ const char *zWalName; /* Name of WAL file */ u32 nCkpt; /* Checkpoint sequence counter in the wal-header */ #ifdef SQLITE_DEBUG u8 lockError; /* True if a locking error has occurred */ |
︙ | ︙ | |||
1266 1267 1268 1269 1270 1271 1272 | return rc; } /* ** Close an open wal-index. */ static void walIndexClose(Wal *pWal, int isDelete){ | | < > > | 1267 1268 1269 1270 1271 1272 1273 1274 1275 1276 1277 1278 1279 1280 1281 1282 1283 1284 1285 1286 1287 1288 | return rc; } /* ** Close an open wal-index. */ static void walIndexClose(Wal *pWal, int isDelete){ if( pWal->exclusiveMode==WAL_HEAPMEMORY_MODE || pWal->bUnlocked ){ int i; for(i=0; i<pWal->nWiData; i++){ sqlite3_free((void *)pWal->apWiData[i]); pWal->apWiData[i] = 0; } } if( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE ){ sqlite3OsShmUnmap(pWal->pDbFd, isDelete); } } /* ** Open a connection to the WAL file zWalName. The database file must ** already be opened on connection pDbFd. The buffer that zWalName points |
︙ | ︙ | |||
2087 2088 2089 2090 2091 2092 2093 | volatile u32 *page0; /* Chunk of wal-index containing header */ /* Ensure that page 0 of the wal-index (the page that contains the ** wal-index header) is mapped. Return early if an error occurs here. */ assert( pChanged ); rc = walIndexPage(pWal, 0, &page0); | < | < | < < | > | | > | | 2089 2090 2091 2092 2093 2094 2095 2096 2097 2098 2099 2100 2101 2102 2103 2104 2105 2106 2107 2108 2109 2110 2111 2112 2113 2114 2115 2116 2117 2118 2119 2120 2121 2122 2123 2124 2125 2126 | volatile u32 *page0; /* Chunk of wal-index containing header */ /* Ensure that page 0 of the wal-index (the page that contains the ** wal-index header) is mapped. Return early if an error occurs here. */ assert( pChanged ); rc = walIndexPage(pWal, 0, &page0); if( rc==SQLITE_READONLY_CANTLOCK ){ assert( page0==0 && pWal->writeLock==0 ); pWal->bUnlocked = 1; pWal->exclusiveMode = WAL_HEAPMEMORY_MODE; *pChanged = 1; }else if( rc!=SQLITE_OK ){ return rc; }; assert( page0 || pWal->writeLock==0 ); /* If the first page of the wal-index has been mapped, try to read the ** wal-index header immediately, without holding any lock. This usually ** works, but may fail if the wal-index header is corrupt or currently ** being modified by another thread or process. */ badHdr = (page0 ? walIndexTryHdr(pWal, pChanged) : 1); /* If the first attempt failed, it might have been due to a race ** with a writer. So get a WRITE lock and try again. */ assert( badHdr==0 || pWal->writeLock==0 ); if( badHdr ){ if( pWal->bUnlocked==0 && (pWal->readOnly & WAL_SHM_RDONLY) ){ if( SQLITE_OK==(rc = walLockShared(pWal, WAL_WRITE_LOCK)) ){ walUnlockShared(pWal, WAL_WRITE_LOCK); rc = SQLITE_READONLY_RECOVERY; } }else if( SQLITE_OK==(rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1)) ){ pWal->writeLock = 1; if( SQLITE_OK==(rc = walIndexPage(pWal, 0, &page0)) ){ |
︙ | ︙ | |||
2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 | /* If the header is read successfully, check the version number to make ** sure the wal-index was not constructed with some future format that ** this version of SQLite cannot understand. */ if( badHdr==0 && pWal->hdr.iVersion!=WALINDEX_MAX_VERSION ){ rc = SQLITE_CANTOPEN_BKPT; } return rc; } /* ** This is the value that walTryBeginRead returns when it needs to ** be retried. */ #define WAL_RETRY (-1) /* ** Attempt to start a read transaction. This might fail due to a race or ** other transient condition. When that happens, it returns WAL_RETRY to ** indicate to the caller that it is safe to retry immediately. ** ** On success return SQLITE_OK. On a permanent failure (such an | > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > > | 2142 2143 2144 2145 2146 2147 2148 2149 2150 2151 2152 2153 2154 2155 2156 2157 2158 2159 2160 2161 2162 2163 2164 2165 2166 2167 2168 2169 2170 2171 2172 2173 2174 2175 2176 2177 2178 2179 2180 2181 2182 2183 2184 2185 2186 2187 2188 2189 2190 2191 2192 2193 2194 2195 2196 2197 2198 2199 2200 2201 2202 2203 2204 2205 2206 2207 2208 2209 2210 2211 2212 2213 2214 2215 2216 2217 2218 2219 2220 2221 2222 2223 2224 2225 2226 2227 2228 2229 2230 2231 2232 2233 2234 2235 2236 2237 2238 2239 2240 2241 2242 2243 2244 2245 2246 2247 2248 2249 2250 2251 2252 2253 2254 2255 2256 2257 2258 2259 2260 2261 2262 2263 2264 2265 2266 2267 2268 2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 2283 2284 2285 2286 2287 2288 2289 2290 2291 2292 2293 2294 2295 2296 2297 2298 2299 2300 2301 2302 2303 2304 2305 2306 2307 2308 | /* If the header is read successfully, check the version number to make ** sure the wal-index was not constructed with some future format that ** this version of SQLite cannot understand. */ if( badHdr==0 && pWal->hdr.iVersion!=WALINDEX_MAX_VERSION ){ rc = SQLITE_CANTOPEN_BKPT; } if( pWal->bUnlocked ){ if( rc!=SQLITE_OK ){ walIndexClose(pWal, 0); } pWal->exclusiveMode = WAL_NORMAL_MODE; } return rc; } /* ** This is the value that walTryBeginRead returns when it needs to ** be retried. */ #define WAL_RETRY (-1) /* ** Open an "unlocked" transaction. An unlocked transaction is a read ** transaction used by a read-only client in cases where the *-shm ** file cannot be mapped and its contents cannot be trusted. It is ** assumed that the *-wal file has been read and that a wal-index ** constructed in heap memory is currently available in Wal.apWiData[]. ** ** If this function returns SQLITE_OK, then the read transaction has ** been successfully opened. In this case output variable (*pChanged) ** is set to true before returning if the caller should discard the ** contents of the page cache before proceeding. Or, if it returns ** WAL_RETRY, then the heap memory wal-index has been discarded and ** the caller should retry opening the read transaction from the ** beginning (including attempting to map the *-shm file). ** ** If an error occurs, an SQLite error code is returned. */ static int walBeginUnlocked(Wal *pWal, int *pChanged){ i64 szWal; /* Size of wal file on disk in bytes */ i64 iOffset; /* Current offset when reading wal file */ u8 aBuf[WAL_HDRSIZE]; /* Buffer to load WAL header into */ u8 *aFrame = 0; /* Malloc'd buffer to load entire frame */ int szFrame; /* Number of bytes in buffer aFrame[] */ u8 *aData; /* Pointer to data part of aFrame buffer */ volatile void *pDummy; /* Dummy argument for xShmMap */ int rc; /* Return code */ u32 aSaveCksum[2]; /* Saved copy of pWal->hdr.aFrameCksum */ assert( pWal->bUnlocked ); assert( pWal->readOnly & WAL_SHM_RDONLY ); assert( pWal->nWiData>0 && pWal->apWiData[0] ); /* Take WAL_READ_LOCK(0). This has the effect of preventing any ** live clients from running a checkpoint, but does not stop them ** from running recovery. */ rc = walLockShared(pWal, WAL_READ_LOCK(0)); if( rc!=SQLITE_OK ){ return (rc==SQLITE_BUSY ? WAL_RETRY : rc); } pWal->readLock = 0; /* Try to map the *-shm file again. If it succeeds this time, then ** a non-readonly_shm connection has already connected to the database. ** In this case, start over with opening the transaction. ** ** The WAL_READ_LOCK(0) lock held by this client prevents a checkpoint ** from taking place. But it does not prevent the wal from being wrapped ** if a checkpoint has already taken place. This means that if another ** client is connected at this point, it may have already checkpointed ** the entire wal. In that case it would not be safe to continue with ** the unlocked transaction, as the other client may overwrite wal ** frames that this client is still using. */ rc = sqlite3OsShmMap(pWal->pDbFd, 0, WALINDEX_PGSZ, 0, &pDummy); if( rc!=SQLITE_READONLY_CANTLOCK ){ assert( rc!=SQLITE_OK ); rc = (rc==SQLITE_READONLY ? WAL_RETRY : rc); goto begin_unlocked_out; } memcpy(&pWal->hdr, (void*)walIndexHdr(pWal), sizeof(WalIndexHdr)); rc = sqlite3OsFileSize(pWal->pWalFd, &szWal); if( rc!=SQLITE_OK || (szWal<WAL_HDRSIZE && pWal->hdr.mxFrame==0) ){ /* If the wal file is too small to contain a wal-header and the ** wal-index header has mxFrame==0, then it must be safe to proceed ** reading the database file only. However, the page cache cannot ** be trusted, as a read/write connection may have connected, written ** the db, run a checkpoint, truncated the wal file and disconnected ** since this client's last read transaction. */ *pChanged = 1; goto begin_unlocked_out; } /* Check the salt keys at the start of the wal file still match. */ rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0); if( rc!=SQLITE_OK ){ goto begin_unlocked_out; } if( memcmp(&pWal->hdr.aSalt, &aBuf[16], 8) ){ rc = WAL_RETRY; goto begin_unlocked_out; } /* Allocate a buffer to read frames into */ szFrame = pWal->hdr.szPage + WAL_FRAME_HDRSIZE; aFrame = (u8 *)sqlite3_malloc64(szFrame); if( aFrame==0 ){ rc = SQLITE_NOMEM_BKPT; goto begin_unlocked_out; } aData = &aFrame[WAL_FRAME_HDRSIZE]; aSaveCksum[0] = pWal->hdr.aFrameCksum[0]; aSaveCksum[1] = pWal->hdr.aFrameCksum[1]; for(iOffset=walFrameOffset(pWal->hdr.mxFrame+1, pWal->hdr.szPage); iOffset+szFrame<=szWal; iOffset+=szFrame ){ u32 pgno; /* Database page number for frame */ u32 nTruncate; /* dbsize field from frame header */ /* Read and decode the next log frame. */ rc = sqlite3OsRead(pWal->pWalFd, aFrame, szFrame, iOffset); if( rc!=SQLITE_OK ){ if( rc==SQLITE_IOERR_SHORT_READ ){ /* If this branch is taken, some other client has truncated the ** *-wal file since the call to sqlite3OsFileSize() above. This ** indicates that a read-write client has connected to the system. ** So retry opening this read transaction. */ rc = WAL_RETRY; } break; } if( !walDecodeFrame(pWal, &pgno, &nTruncate, aData, aFrame) ) break; /* If nTruncate is non-zero, this is a commit record. */ if( nTruncate ){ rc = WAL_RETRY; break; } } pWal->hdr.aFrameCksum[0] = aSaveCksum[0]; pWal->hdr.aFrameCksum[1] = aSaveCksum[1]; begin_unlocked_out: sqlite3_free(aFrame); if( rc!=SQLITE_OK ){ int i; for(i=0; i<pWal->nWiData; i++){ sqlite3_free((void*)pWal->apWiData[i]); pWal->apWiData[i] = 0; } pWal->bUnlocked = 0; sqlite3WalEndReadTransaction(pWal); *pChanged = 1; } return rc; } /* ** Attempt to start a read transaction. This might fail due to a race or ** other transient condition. When that happens, it returns WAL_RETRY to ** indicate to the caller that it is safe to retry immediately. ** ** On success return SQLITE_OK. On a permanent failure (such an |
︙ | ︙ | |||
2240 2241 2242 2243 2244 2245 2246 | return SQLITE_PROTOCOL; } if( cnt>=10 ) nDelay = (cnt-9)*(cnt-9)*39; sqlite3OsSleep(pWal->pVfs, nDelay); } if( !useWal ){ | > > | > | 2384 2385 2386 2387 2388 2389 2390 2391 2392 2393 2394 2395 2396 2397 2398 2399 2400 2401 | return SQLITE_PROTOCOL; } if( cnt>=10 ) nDelay = (cnt-9)*(cnt-9)*39; sqlite3OsSleep(pWal->pVfs, nDelay); } if( !useWal ){ assert( rc==SQLITE_OK ); if( pWal->bUnlocked==0 ){ rc = walIndexReadHdr(pWal, pChanged); } if( rc==SQLITE_BUSY ){ /* If there is not a recovery running in another thread or process ** then convert BUSY errors to WAL_RETRY. If recovery is known to ** be running, convert BUSY to BUSY_RECOVERY. There is a race here ** which might cause WAL_RETRY to be returned even if BUSY_RECOVERY ** would be technically correct. But the race is benign since with ** WAL_RETRY this routine will be called again and will probably be |
︙ | ︙ | |||
2269 2270 2271 2272 2273 2274 2275 2276 2277 2278 2279 2280 2281 2282 | }else if( rc==SQLITE_BUSY ){ rc = SQLITE_BUSY_RECOVERY; } } if( rc!=SQLITE_OK ){ return rc; } } assert( pWal->nWiData>0 ); assert( pWal->apWiData[0] || (pWal->readOnly & WAL_SHM_RDONLY) ); pInfo = pWal->apWiData[0] ? walCkptInfo(pWal) : 0; if( !useWal && (pInfo==0 || pInfo->nBackfill==pWal->hdr.mxFrame) #ifdef SQLITE_ENABLE_SNAPSHOT | > > > | 2416 2417 2418 2419 2420 2421 2422 2423 2424 2425 2426 2427 2428 2429 2430 2431 2432 | }else if( rc==SQLITE_BUSY ){ rc = SQLITE_BUSY_RECOVERY; } } if( rc!=SQLITE_OK ){ return rc; } else if( pWal->bUnlocked ){ return walBeginUnlocked(pWal, pChanged); } } assert( pWal->nWiData>0 ); assert( pWal->apWiData[0] || (pWal->readOnly & WAL_SHM_RDONLY) ); pInfo = pWal->apWiData[0] ? walCkptInfo(pWal) : 0; if( !useWal && (pInfo==0 || pInfo->nBackfill==pWal->hdr.mxFrame) #ifdef SQLITE_ENABLE_SNAPSHOT |
︙ | ︙ | |||
2622 2623 2624 2625 2626 2627 2628 | /* If the "last page" field of the wal-index header snapshot is 0, then ** no data will be read from the wal under any circumstances. Return early ** in this case as an optimization. Likewise, if pWal->readLock==0, ** then the WAL is ignored by the reader so return early, as if the ** WAL were empty. */ | | | 2772 2773 2774 2775 2776 2777 2778 2779 2780 2781 2782 2783 2784 2785 2786 | /* If the "last page" field of the wal-index header snapshot is 0, then ** no data will be read from the wal under any circumstances. Return early ** in this case as an optimization. Likewise, if pWal->readLock==0, ** then the WAL is ignored by the reader so return early, as if the ** WAL were empty. */ if( iLast==0 || (pWal->readLock==0 && pWal->bUnlocked==0) ){ *piRead = 0; return SQLITE_OK; } /* Search the hash table or tables for an entry matching page number ** pgno. Each iteration of the following for() loop searches one ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames). |
︙ | ︙ |
Changes to test/walro.test.
︙ | ︙ | |||
101 102 103 104 105 106 107 | code1 { db close } list [file exists test.db-wal] [file exists test.db-shm] } {1 1} do_test 1.2.2 { code1 { sqlite3 db file:test.db?readonly_shm=1 } list [catch { sql1 { SELECT * FROM t1 } } msg] $msg | | | | | 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 | code1 { db close } list [file exists test.db-wal] [file exists test.db-shm] } {1 1} do_test 1.2.2 { code1 { sqlite3 db file:test.db?readonly_shm=1 } list [catch { sql1 { SELECT * FROM t1 } } msg] $msg } {0 {a b c d e f g h i j}} do_test 1.2.3 { code1 { db close } file attributes test.db-shm -permissions rw-r--r-- hexio_write test.db-shm 0 01020304 file attributes test.db-shm -permissions r--r--r-- code1 { sqlite3 db file:test.db?readonly_shm=1 } csql1 { SELECT * FROM t1 } } {0 {a b c d e f g h i j}} do_test 1.2.4 { code1 { sqlite3_extended_errcode db } } {SQLITE_OK} do_test 1.2.5 { file attributes test.db-shm -permissions rw-r--r-- code2 { sqlite3 db2 test.db } sql2 "SELECT * FROM t1" } {a b c d e f g h i j} file attributes test.db-shm -permissions r--r--r-- |
︙ | ︙ | |||
158 159 160 161 162 163 164 | } {1 {unable to open database file}} do_test 1.3.2.3 { code1 { db close } close [open test.db-shm w] file attributes test.db-shm -permissions r--r--r-- code1 { sqlite3 db file:test.db?readonly_shm=1 } csql1 { SELECT * FROM t1 } | | | | 158 159 160 161 162 163 164 165 166 167 168 169 170 171 172 173 174 175 | } {1 {unable to open database file}} do_test 1.3.2.3 { code1 { db close } close [open test.db-shm w] file attributes test.db-shm -permissions r--r--r-- code1 { sqlite3 db file:test.db?readonly_shm=1 } csql1 { SELECT * FROM t1 } } {0 {a b c d e f g h i j k l}} do_test 1.3.2.4 { code1 { sqlite3_extended_errcode db } } {SQLITE_OK} #----------------------------------------------------------------------- # Test cases 1.4.* check that checkpoints and log wraps don't prevent # read-only connections from reading the database. do_test 1.4.1 { code1 { db close } forcedelete test.db-shm |
︙ | ︙ |
Changes to test/walro2.test.
︙ | ︙ | |||
58 59 60 61 62 63 64 | PRAGMA journal_mode = WAL; INSERT INTO t1 VALUES('a', 'b'); INSERT INTO t1 VALUES('c', 'd'); } file exists test.db-shm } {1} | | | > > > | 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 | PRAGMA journal_mode = WAL; INSERT INTO t1 VALUES('a', 'b'); INSERT INTO t1 VALUES('c', 'd'); } file exists test.db-shm } {1} do_test 1.2.1 { forcecopy test.db test.db2 forcecopy test.db-wal test.db2-wal forcecopy test.db-shm test.db2-shm code1 { sqlite3 db file:test.db2?readonly_shm=1 } sql1 { SELECT * FROM t1 } } {a b c d} do_test 1.2.2 { sql1 { SELECT * FROM t1 } } {a b c d} do_test 1.3.1 { code3 { sqlite3 db3 test.db2 } sql3 { SELECT * FROM t1 } } {a b c d} do_test 1.3.2 { |
︙ | ︙ | |||
102 103 104 105 106 107 108 | code1 { sqlite3 db file:test.db2?readonly_shm=1 } sql1 { BEGIN; SELECT * FROM t1; } | | | > > > > > > > > | 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 | code1 { sqlite3 db file:test.db2?readonly_shm=1 } sql1 { BEGIN; SELECT * FROM t1; } } {a b c d e f g h} do_test 2.3.1 { code3 { sqlite3 db3 test.db2 } sql3 { SELECT * FROM t1 } } {a b c d e f g h} do_test 2.3.2 { sql3 { INSERT INTO t1 VALUES('i', 'j') } code3 { db3 close } sql1 { COMMIT } } {} breakpoint do_test 2.3.3 { sql1 { SELECT * FROM t1 } } {a b c d e f g h i j} } finish_test |