/ Check-in [d3c25740]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Improved comments and variable names in the read-only WAL logic.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | readonly-wal-recovery
Files: files | file ages | folders
SHA3-256: d3c25740eec9a2a41c29e6e488fcf6587c1fb821147a442c29439b25a92154a5
User & Date: drh 2017-11-10 20:00:50
Context
2017-11-11
13:30
Further comment improvements in wal.c. No code changes. check-in: 34638800 user: drh tags: readonly-wal-recovery
2017-11-10
20:00
Improved comments and variable names in the read-only WAL logic. check-in: d3c25740 user: drh tags: readonly-wal-recovery
2017-11-09
23:24
Avoid superfluous SHM unlock call in the Win32 VFS. check-in: 5a384be6 user: mistachkin tags: readonly-wal-recovery
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to src/wal.c.

   451    451     u8 exclusiveMode;          /* Non-zero if connection is in exclusive mode */
   452    452     u8 writeLock;              /* True if in a write transaction */
   453    453     u8 ckptLock;               /* True if holding a checkpoint lock */
   454    454     u8 readOnly;               /* WAL_RDWR, WAL_RDONLY, or WAL_SHM_RDONLY */
   455    455     u8 truncateOnCommit;       /* True to truncate WAL file on commit */
   456    456     u8 syncHeader;             /* Fsync the WAL header if true */
   457    457     u8 padToSectorBoundary;    /* Pad transactions out to the next sector */
   458         -  u8 bUnlocked;
          458  +  u8 bShmUnreliable;         /* SHM content is read-only and unreliable */
   459    459     WalIndexHdr hdr;           /* Wal-index header for current transaction */
   460    460     u32 minFrame;              /* Ignore wal frames before this one */
   461    461     u32 iReCksum;              /* On commit, recalculate checksums from here */
   462    462     const char *zWalName;      /* Name of WAL file */
   463    463     u32 nCkpt;                 /* Checkpoint sequence counter in the wal-header */
   464    464   #ifdef SQLITE_DEBUG
   465    465     u8 lockError;              /* True if a locking error has occurred */
................................................................................
  1267   1267     return rc;
  1268   1268   }
  1269   1269   
  1270   1270   /*
  1271   1271   ** Close an open wal-index.
  1272   1272   */
  1273   1273   static void walIndexClose(Wal *pWal, int isDelete){
  1274         -  if( pWal->exclusiveMode==WAL_HEAPMEMORY_MODE || pWal->bUnlocked ){
         1274  +  if( pWal->exclusiveMode==WAL_HEAPMEMORY_MODE || pWal->bShmUnreliable ){
  1275   1275       int i;
  1276   1276       for(i=0; i<pWal->nWiData; i++){
  1277   1277         sqlite3_free((void *)pWal->apWiData[i]);
  1278   1278         pWal->apWiData[i] = 0;
  1279   1279       }
  1280   1280     }
  1281   1281     if( pWal->exclusiveMode!=WAL_HEAPMEMORY_MODE ){
................................................................................
  2095   2095     volatile u32 *page0;            /* Chunk of wal-index containing header */
  2096   2096   
  2097   2097     /* Ensure that page 0 of the wal-index (the page that contains the 
  2098   2098     ** wal-index header) is mapped. Return early if an error occurs here.
  2099   2099     */
  2100   2100     assert( pChanged );
  2101   2101     rc = walIndexPage(pWal, 0, &page0);
  2102         -  if( rc==SQLITE_READONLY_CANTINIT ){
  2103         -    assert( page0==0 && pWal->writeLock==0 );
  2104         -    pWal->bUnlocked = 1;
  2105         -    pWal->exclusiveMode = WAL_HEAPMEMORY_MODE;
  2106         -    *pChanged = 1;
  2107         -  }else
  2108   2102     if( rc!=SQLITE_OK ){
  2109         -    return rc;
         2103  +    assert( rc!=SQLITE_READONLY ); /* READONLY changed to OK in walIndexPage */
         2104  +    if( rc==SQLITE_READONLY_CANTINIT ){
         2105  +      /* The SQLITE_READONLY_CANTINIT return means that the shared-memory
         2106  +      ** was openable but is not writable, and this thread is unable to
         2107  +      ** confirm that another write-capable connection has the shared-memory
         2108  +      ** open, and hence the content of the shared-memory is unreliable,
         2109  +      ** since the shared-memory might be inconsistent with the WAL file
         2110  +      ** and there is no writer on hand to fix it. */
         2111  +      assert( page0==0 && pWal->writeLock==0 );
         2112  +      pWal->bShmUnreliable = 1;
         2113  +      pWal->exclusiveMode = WAL_HEAPMEMORY_MODE;
         2114  +      *pChanged = 1;
         2115  +    }else{
         2116  +      return rc; /* Any other non-OK return is just an error */
         2117  +    }
  2110   2118     };
  2111   2119     assert( page0 || pWal->writeLock==0 );
  2112   2120   
  2113   2121     /* If the first page of the wal-index has been mapped, try to read the
  2114   2122     ** wal-index header immediately, without holding any lock. This usually
  2115   2123     ** works, but may fail if the wal-index header is corrupt or currently 
  2116   2124     ** being modified by another thread or process.
................................................................................
  2118   2126     badHdr = (page0 ? walIndexTryHdr(pWal, pChanged) : 1);
  2119   2127   
  2120   2128     /* If the first attempt failed, it might have been due to a race
  2121   2129     ** with a writer.  So get a WRITE lock and try again.
  2122   2130     */
  2123   2131     assert( badHdr==0 || pWal->writeLock==0 );
  2124   2132     if( badHdr ){
  2125         -    if( pWal->bUnlocked==0 && (pWal->readOnly & WAL_SHM_RDONLY) ){
         2133  +    if( pWal->bShmUnreliable==0 && (pWal->readOnly & WAL_SHM_RDONLY) ){
  2126   2134         if( SQLITE_OK==(rc = walLockShared(pWal, WAL_WRITE_LOCK)) ){
  2127   2135           walUnlockShared(pWal, WAL_WRITE_LOCK);
  2128   2136           rc = SQLITE_READONLY_RECOVERY;
  2129   2137         }
  2130   2138       }else if( SQLITE_OK==(rc = walLockExclusive(pWal, WAL_WRITE_LOCK, 1)) ){
  2131   2139         pWal->writeLock = 1;
  2132   2140         if( SQLITE_OK==(rc = walIndexPage(pWal, 0, &page0)) ){
................................................................................
  2148   2156     /* If the header is read successfully, check the version number to make
  2149   2157     ** sure the wal-index was not constructed with some future format that
  2150   2158     ** this version of SQLite cannot understand.
  2151   2159     */
  2152   2160     if( badHdr==0 && pWal->hdr.iVersion!=WALINDEX_MAX_VERSION ){
  2153   2161       rc = SQLITE_CANTOPEN_BKPT;
  2154   2162     }
  2155         -  if( pWal->bUnlocked ){
         2163  +  if( pWal->bShmUnreliable ){
  2156   2164       if( rc!=SQLITE_OK ){
  2157   2165         walIndexClose(pWal, 0);
  2158         -      pWal->bUnlocked = 0;
         2166  +      pWal->bShmUnreliable = 0;
  2159   2167         assert( pWal->nWiData>0 && pWal->apWiData[0]==0 );
  2160   2168         if( rc==SQLITE_IOERR_SHORT_READ ) rc = WAL_RETRY;
  2161   2169       }
  2162   2170       pWal->exclusiveMode = WAL_NORMAL_MODE;
  2163   2171     }
  2164   2172   
  2165   2173     return rc;
  2166   2174   }
  2167   2175   
  2168   2176   /*
  2169         -** Open an "unlocked" transaction. An unlocked transaction is a read 
  2170         -** transaction used by a read-only client in cases where the *-shm
  2171         -** file cannot be mapped and its contents cannot be trusted. It is
  2172         -** assumed that the *-wal file has been read and that a wal-index 
  2173         -** constructed in heap memory is currently available in Wal.apWiData[].
         2177  +** Open a transaction in a connection where the shared-memory is read-only
         2178  +** and where we cannot verify that there is a separate write-capable connection
         2179  +** on hand to keep the shared-memory up-to-date with the WAL file.
         2180  +**
         2181  +** This can happen, for example, when the shared-memory is implemented by
         2182  +** memory-mapping a *-shm file, where a prior writer has shut down and
         2183  +** left the *-shm file on disk, and now the present connection is trying
         2184  +** to use that database but lacks write permission on the *-shm file.
         2185  +** Other scenarios are also possible, depending on the VFS implementation.
         2186  +**
         2187  +** Precondition:
         2188  +**
         2189  +**    The *-wal file has been read and an appropriate wal-index has been
         2190  +**    constructed in pWal->apWiData[] using heap memory instead of shared
         2191  +**    memory. 
  2174   2192   **
  2175   2193   ** If this function returns SQLITE_OK, then the read transaction has
  2176   2194   ** been successfully opened. In this case output variable (*pChanged) 
  2177   2195   ** is set to true before returning if the caller should discard the
  2178   2196   ** contents of the page cache before proceeding. Or, if it returns 
  2179   2197   ** WAL_RETRY, then the heap memory wal-index has been discarded and 
  2180   2198   ** the caller should retry opening the read transaction from the 
  2181   2199   ** beginning (including attempting to map the *-shm file). 
  2182   2200   **
  2183   2201   ** If an error occurs, an SQLite error code is returned.
  2184   2202   */
  2185         -static int walBeginUnlocked(Wal *pWal, int *pChanged){
         2203  +static int walBeginShmUnreliable(Wal *pWal, int *pChanged){
  2186   2204     i64 szWal;                      /* Size of wal file on disk in bytes */
  2187   2205     i64 iOffset;                    /* Current offset when reading wal file */
  2188   2206     u8 aBuf[WAL_HDRSIZE];           /* Buffer to load WAL header into */
  2189   2207     u8 *aFrame = 0;                 /* Malloc'd buffer to load entire frame */
  2190   2208     int szFrame;                    /* Number of bytes in buffer aFrame[] */
  2191   2209     u8 *aData;                      /* Pointer to data part of aFrame buffer */
  2192   2210     volatile void *pDummy;          /* Dummy argument for xShmMap */
  2193   2211     int rc;                         /* Return code */
  2194   2212     u32 aSaveCksum[2];              /* Saved copy of pWal->hdr.aFrameCksum */
  2195   2213   
  2196         -  assert( pWal->bUnlocked );
         2214  +  assert( pWal->bShmUnreliable );
  2197   2215     assert( pWal->readOnly & WAL_SHM_RDONLY );
  2198   2216     assert( pWal->nWiData>0 && pWal->apWiData[0] );
  2199   2217   
  2200   2218     /* Take WAL_READ_LOCK(0). This has the effect of preventing any
  2201         -  ** live clients from running a checkpoint, but does not stop them
         2219  +  ** writers from running a checkpoint, but does not stop them
  2202   2220     ** from running recovery.  */
  2203   2221     rc = walLockShared(pWal, WAL_READ_LOCK(0));
  2204   2222     if( rc!=SQLITE_OK ){
  2205   2223       if( rc==SQLITE_BUSY ) rc = WAL_RETRY;
  2206         -    goto begin_unlocked_out;
         2224  +    goto begin_unreliable_shm_out;
  2207   2225     }
  2208   2226     pWal->readLock = 0;
  2209   2227   
  2210         -  /* Try to map the *-shm file again. If it succeeds this time, then 
  2211         -  ** a non-readonly_shm connection has already connected to the database.
  2212         -  ** In this case, start over with opening the transaction.
         2228  +  /* Check to see if a separate writer has attached to the shared-memory area,
         2229  +  ** thus making the shared-memory "reliable" again.  Do this by invoking
         2230  +  ** the xShmMap() routine of the VFS and looking to see if the return
         2231  +  ** is SQLITE_READONLY instead of SQLITE_READONLY_CANTINIT.
  2213   2232     **
  2214         -  ** The *-shm file was opened read-only, so sqlite3OsShmMap() can never
  2215         -  ** return SQLITE_OK here, as that would imply that it had established
  2216         -  ** a read/write mapping.  A return of SQLITE_READONLY means success - that
  2217         -  ** a mapping has been established to a shared-memory segment that is actively
  2218         -  ** maintained by a writer.  SQLITE_READONLY_CANTINIT means that all
  2219         -  ** all connections to the -shm file are read-only and hence the content
  2220         -  ** of the -shm file might be out-of-date.
  2221         -  ** 
  2222         -  ** The WAL_READ_LOCK(0) lock held by this client prevents a checkpoint
  2223         -  ** from taking place. But it does not prevent the wal from being wrapped
  2224         -  ** if a checkpoint has already taken place. This means that if another
  2225         -  ** client is connected at this point, it may have already checkpointed 
  2226         -  ** the entire wal. In that case it would not be safe to continue with
  2227         -  ** the unlocked transaction, as the other client may overwrite wal 
  2228         -  ** frames that this client is still using.  */
         2233  +  ** Once sqlite3OsShmMap() has been called for a file and has returned
         2234  +  ** any SQLITE_READONLY value, it must return SQLITE_READONLY or
         2235  +  ** SQLITE_READONLY_CANTINIT or some error for all subsequent invocations,
         2236  +  ** until sqlite3OsShmUnmap() has been called.  This is a requirement
         2237  +  ** on the VFS implementation.
         2238  +  **
         2239  +  ** If the shared-memory is now "reliable" return WAL_RETRY, which will
         2240  +  ** cause the heap-memory WAL-index to be discarded and the actual
         2241  +  ** shared memory to be used in its place.
         2242  +  */
  2229   2243     rc = sqlite3OsShmMap(pWal->pDbFd, 0, WALINDEX_PGSZ, 0, &pDummy);
  2230   2244     assert( rc!=SQLITE_OK ); /* SQLITE_OK not possible for read-only connection */
  2231   2245     if( rc!=SQLITE_READONLY_CANTINIT ){
  2232   2246       rc = (rc==SQLITE_READONLY ? WAL_RETRY : rc);
  2233         -    goto begin_unlocked_out;
         2247  +    goto begin_unreliable_shm_out;
  2234   2248     }
  2235   2249   
         2250  +  /* Reach this point only if the real shared-memory is still unreliable.
         2251  +  ** Assume the in-memory WAL-index substitute is correct and load it
         2252  +  ** into pWal->hdr.
         2253  +  */
  2236   2254     memcpy(&pWal->hdr, (void*)walIndexHdr(pWal), sizeof(WalIndexHdr));
         2255  +
         2256  +  /* The WAL_READ_LOCK(0) lock held by this client prevents a checkpoint
         2257  +  ** from taking place. But it does not prevent the wal from being wrapped
         2258  +  ** if a checkpoint has already taken place. This means that if another
         2259  +  ** client is connected at this point, it may have already checkpointed 
         2260  +  ** the entire wal. In that case it would not be safe to continue with
         2261  +  ** this transaction, as the other client may overwrite wal 
         2262  +  ** frames that this client is still using.
         2263  +  */
  2237   2264     rc = sqlite3OsFileSize(pWal->pWalFd, &szWal);
  2238   2265     if( rc!=SQLITE_OK ){
  2239         -    goto begin_unlocked_out;
         2266  +    goto begin_unreliable_shm_out;
  2240   2267     }
  2241   2268     if( szWal<WAL_HDRSIZE ){
  2242   2269       /* If the wal file is too small to contain a wal-header and the
  2243   2270       ** wal-index header has mxFrame==0, then it must be safe to proceed
  2244   2271       ** reading the database file only. However, the page cache cannot
  2245   2272       ** be trusted, as a read/write connection may have connected, written
  2246   2273       ** the db, run a checkpoint, truncated the wal file and disconnected
  2247   2274       ** since this client's last read transaction.  */
  2248   2275       *pChanged = 1;
  2249   2276       rc = (pWal->hdr.mxFrame==0 ? SQLITE_OK : WAL_RETRY);
  2250         -    goto begin_unlocked_out;
         2277  +    goto begin_unreliable_shm_out;
  2251   2278     }
  2252   2279   
  2253   2280     /* Check the salt keys at the start of the wal file still match. */
  2254   2281     rc = sqlite3OsRead(pWal->pWalFd, aBuf, WAL_HDRSIZE, 0);
  2255   2282     if( rc!=SQLITE_OK ){
  2256         -    goto begin_unlocked_out;
         2283  +    goto begin_unreliable_shm_out;
  2257   2284     }
  2258   2285     if( memcmp(&pWal->hdr.aSalt, &aBuf[16], 8) ){
  2259   2286       rc = WAL_RETRY;
  2260         -    goto begin_unlocked_out;
         2287  +    goto begin_unreliable_shm_out;
  2261   2288     }
  2262   2289   
  2263   2290     /* Allocate a buffer to read frames into */
  2264   2291     szFrame = pWal->hdr.szPage + WAL_FRAME_HDRSIZE;
  2265   2292     aFrame = (u8 *)sqlite3_malloc64(szFrame);
  2266   2293     if( aFrame==0 ){
  2267   2294       rc = SQLITE_NOMEM_BKPT;
  2268         -    goto begin_unlocked_out;
         2295  +    goto begin_unreliable_shm_out;
  2269   2296     }
  2270   2297     aData = &aFrame[WAL_FRAME_HDRSIZE];
  2271   2298   
  2272   2299     /* Check to see if a complete transaction has been appended to the
  2273   2300     ** wal file since the heap-memory wal-index was created. If so, the
  2274   2301     ** heap-memory wal-index is discarded and WAL_RETRY returned to
  2275   2302     ** the caller.  */
................................................................................
  2294   2321         rc = WAL_RETRY;
  2295   2322         break;
  2296   2323       }
  2297   2324     }
  2298   2325     pWal->hdr.aFrameCksum[0] = aSaveCksum[0];
  2299   2326     pWal->hdr.aFrameCksum[1] = aSaveCksum[1];
  2300   2327   
  2301         - begin_unlocked_out:
         2328  + begin_unreliable_shm_out:
  2302   2329     sqlite3_free(aFrame);
  2303   2330     if( rc!=SQLITE_OK ){
  2304   2331       int i;
  2305   2332       for(i=0; i<pWal->nWiData; i++){
  2306   2333         sqlite3_free((void*)pWal->apWiData[i]);
  2307   2334         pWal->apWiData[i] = 0;
  2308   2335       }
  2309         -    pWal->bUnlocked = 0;
         2336  +    pWal->bShmUnreliable = 0;
  2310   2337       sqlite3WalEndReadTransaction(pWal);
  2311   2338       *pChanged = 1;
  2312   2339     }
  2313   2340     return rc;
  2314   2341   }
  2315   2342   
  2316   2343   /*
................................................................................
  2398   2425       }
  2399   2426       if( cnt>=10 ) nDelay = (cnt-9)*(cnt-9)*39;
  2400   2427       sqlite3OsSleep(pWal->pVfs, nDelay);
  2401   2428     }
  2402   2429   
  2403   2430     if( !useWal ){
  2404   2431       assert( rc==SQLITE_OK );
  2405         -    if( pWal->bUnlocked==0 ){
         2432  +    if( pWal->bShmUnreliable==0 ){
  2406   2433         rc = walIndexReadHdr(pWal, pChanged);
  2407   2434       }
  2408   2435       if( rc==SQLITE_BUSY ){
  2409   2436         /* If there is not a recovery running in another thread or process
  2410   2437         ** then convert BUSY errors to WAL_RETRY.  If recovery is known to
  2411   2438         ** be running, convert BUSY to BUSY_RECOVERY.  There is a race here
  2412   2439         ** which might cause WAL_RETRY to be returned even if BUSY_RECOVERY
................................................................................
  2429   2456         }else if( rc==SQLITE_BUSY ){
  2430   2457           rc = SQLITE_BUSY_RECOVERY;
  2431   2458         }
  2432   2459       }
  2433   2460       if( rc!=SQLITE_OK ){
  2434   2461         return rc;
  2435   2462       }
  2436         -    else if( pWal->bUnlocked ){
  2437         -      return walBeginUnlocked(pWal, pChanged);
         2463  +    else if( pWal->bShmUnreliable ){
         2464  +      return walBeginShmUnreliable(pWal, pChanged);
  2438   2465       }
  2439   2466     }
  2440   2467   
  2441   2468     assert( pWal->nWiData>0 );
  2442   2469     assert( pWal->apWiData[0] || (pWal->readOnly & WAL_SHM_RDONLY) );
  2443   2470     pInfo = pWal->apWiData[0] ? walCkptInfo(pWal) : 0;
  2444   2471     if( !useWal && (pInfo==0 || pInfo->nBackfill==pWal->hdr.mxFrame)
................................................................................
  2785   2812   
  2786   2813     /* If the "last page" field of the wal-index header snapshot is 0, then
  2787   2814     ** no data will be read from the wal under any circumstances. Return early
  2788   2815     ** in this case as an optimization.  Likewise, if pWal->readLock==0, 
  2789   2816     ** then the WAL is ignored by the reader so return early, as if the 
  2790   2817     ** WAL were empty.
  2791   2818     */
  2792         -  if( iLast==0 || (pWal->readLock==0 && pWal->bUnlocked==0) ){
         2819  +  if( iLast==0 || (pWal->readLock==0 && pWal->bShmUnreliable==0) ){
  2793   2820       *piRead = 0;
  2794   2821       return SQLITE_OK;
  2795   2822     }
  2796   2823   
  2797   2824     /* Search the hash table or tables for an entry matching page number
  2798   2825     ** pgno. Each iteration of the following for() loop searches one
  2799   2826     ** hash table (each hash table indexes up to HASHTABLE_NPAGE frames).
................................................................................
  2848   2875   #ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
  2849   2876     /* If expensive assert() statements are available, do a linear search
  2850   2877     ** of the wal-index file content. Make sure the results agree with the
  2851   2878     ** result obtained using the hash indexes above.  */
  2852   2879     {
  2853   2880       u32 iRead2 = 0;
  2854   2881       u32 iTest;
  2855         -    assert( pWal->bUnlocked || pWal->minFrame>0 );
         2882  +    assert( pWal->bShmUnreliable || pWal->minFrame>0 );
  2856   2883       for(iTest=iLast; iTest>=pWal->minFrame && iTest>0; iTest--){
  2857   2884         if( walFramePgno(pWal, iTest)==pgno ){
  2858   2885           iRead2 = iTest;
  2859   2886           break;
  2860   2887         }
  2861   2888       }
  2862   2889       assert( iRead==iRead2 );