SQLite4
Check-in [abe0420724ae2a7caf4a3b346d1ed6c12d55baab]
Not logged in

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
SHA1 Hash:abe0420724ae2a7caf4a3b346d1ed6c12d55baab
Date: 2013-03-06 20:15:16
User: dan
Comment:Allow accessing a prefix of the database file using mmap and the remainder using read and write.
Tags And Properties
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/lsm.h

223
224
225
226
227
228
229
230
231







232
233
234
235
236
237
238
**   This means that this option may cause the connection to perform a 
**   checkpoint even if the current connection has itself written very little
**   data into the database file.
**
**   The default value is 2048 (checkpoint every 2MB).
**
** LSM_CONFIG_MMAP:
**   A read/write integer parameter. True to use mmap() to access the 
**   database file. False otherwise.







**
** LSM_CONFIG_USE_LOG:
**   A read/write boolean parameter. True (the default) to use the log
**   file normally. False otherwise.
**
** LSM_CONFIG_AUTOMERGE:
**   A read/write integer parameter. The minimum number of segments to







|
|
>
>
>
>
>
>
>







223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
**   This means that this option may cause the connection to perform a 
**   checkpoint even if the current connection has itself written very little
**   data into the database file.
**
**   The default value is 2048 (checkpoint every 2MB).
**
** LSM_CONFIG_MMAP:
**   A read/write integer parameter. If this value is set to 0, then the 
**   database file is accessed using ordinary read/write IO functions. Or,
**   if it is set to 1, then the database file is memory mapped and accessed
**   that way. If this parameter is set to any value N greater than 1, then
**   up to the first N KB of the file are memory mapped, and any remainder
**   accessed using read/write IO.
**
**   The default value is 1 on 64-bit platforms and 32768 on 32-bit platforms.
**   
**
** LSM_CONFIG_USE_LOG:
**   A read/write boolean parameter. True (the default) to use the log
**   file normally. False otherwise.
**
** LSM_CONFIG_AUTOMERGE:
**   A read/write integer parameter. The minimum number of segments to

Changes to src/lsmInt.h

46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
...
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
#define LSM_DFLT_BLOCK_SIZE         (1 * 1024 * 1024)
#define LSM_DFLT_AUTOFLUSH          (1 * 1024 * 1024)
#define LSM_DFLT_AUTOCHECKPOINT     (i64)(2 * 1024 * 1024)
#define LSM_DFLT_AUTOWORK           1
#define LSM_DFLT_LOG_SIZE           (128*1024)
#define LSM_DFLT_AUTOMERGE          4
#define LSM_DFLT_SAFETY             LSM_SAFETY_NORMAL
#define LSM_DFLT_MMAP               LSM_IS_64_BIT
#define LSM_DFLT_MULTIPLE_PROCESSES 1
#define LSM_DFLT_USE_LOG            1

/* Initial values for log file checksums. These are only used if the 
** database file does not contain a valid checkpoint.  */
#define LSM_CKSUM0_INIT 42
#define LSM_CKSUM1_INIT 42
................................................................................
  int bAutowork;                  /* Configured by LSM_CONFIG_AUTOWORK */
  int nTreeLimit;                 /* Configured by LSM_CONFIG_AUTOFLUSH */
  int nMerge;                     /* Configured by LSM_CONFIG_AUTOMERGE */
  int bUseLog;                    /* Configured by LSM_CONFIG_USE_LOG */
  int nDfltPgsz;                  /* Configured by LSM_CONFIG_PAGE_SIZE */
  int nDfltBlksz;                 /* Configured by LSM_CONFIG_BLOCK_SIZE */
  int nMaxFreelist;               /* Configured by LSM_CONFIG_MAX_FREELIST */
  int bMmap;                      /* Configured by LSM_CONFIG_MMAP */
  i64 nAutockpt;                  /* Configured by LSM_CONFIG_AUTOCHECKPOINT */
  int bMultiProc;                 /* Configured by L_C_MULTIPLE_PROCESSES */
  int bReadonly;                  /* Configured by LSM_CONFIG_READONLY */
  lsm_compress compress;          /* Compression callbacks */
  lsm_compress_factory factory;   /* Compression callback factory */

  /* Sub-system handles */







|







 







|







46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
...
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
#define LSM_DFLT_BLOCK_SIZE         (1 * 1024 * 1024)
#define LSM_DFLT_AUTOFLUSH          (1 * 1024 * 1024)
#define LSM_DFLT_AUTOCHECKPOINT     (i64)(2 * 1024 * 1024)
#define LSM_DFLT_AUTOWORK           1
#define LSM_DFLT_LOG_SIZE           (128*1024)
#define LSM_DFLT_AUTOMERGE          4
#define LSM_DFLT_SAFETY             LSM_SAFETY_NORMAL
#define LSM_DFLT_MMAP               (LSM_IS_64_BIT ? 1 : 32768)
#define LSM_DFLT_MULTIPLE_PROCESSES 1
#define LSM_DFLT_USE_LOG            1

/* Initial values for log file checksums. These are only used if the 
** database file does not contain a valid checkpoint.  */
#define LSM_CKSUM0_INIT 42
#define LSM_CKSUM1_INIT 42
................................................................................
  int bAutowork;                  /* Configured by LSM_CONFIG_AUTOWORK */
  int nTreeLimit;                 /* Configured by LSM_CONFIG_AUTOFLUSH */
  int nMerge;                     /* Configured by LSM_CONFIG_AUTOMERGE */
  int bUseLog;                    /* Configured by LSM_CONFIG_USE_LOG */
  int nDfltPgsz;                  /* Configured by LSM_CONFIG_PAGE_SIZE */
  int nDfltBlksz;                 /* Configured by LSM_CONFIG_BLOCK_SIZE */
  int nMaxFreelist;               /* Configured by LSM_CONFIG_MAX_FREELIST */
  int iMmap;                      /* Configured by LSM_CONFIG_MMAP */
  i64 nAutockpt;                  /* Configured by LSM_CONFIG_AUTOCHECKPOINT */
  int bMultiProc;                 /* Configured by L_C_MULTIPLE_PROCESSES */
  int bReadonly;                  /* Configured by LSM_CONFIG_READONLY */
  lsm_compress compress;          /* Compression callbacks */
  lsm_compress_factory factory;   /* Compression callback factory */

  /* Sub-system handles */

Changes to src/lsm_file.c

153
154
155
156
157
158
159
160
161
162
163
164





165
166
167
168
169
170




171
172





























173





174
175
176
177
178
179
180
...
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207

208
209
210
211
212
213
214
215
216








217
218
219
220
221
222
223
...
241
242
243
244
245
246
247
248
249


250
251
252
253
254
255
256
...
285
286
287
288
289
290
291
























292
293
294
295
296
297
298
...
450
451
452
453
454
455
456








457
458
459
460
461
462
463
...
599
600
601
602
603
604
605

606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621

622







623
624
625
626
627
628
629
630
631



632
633
634
635
636
637
638
639





640
641
642
643
644
645
646
...
882
883
884
885
886
887
888

889
890
891
892
893
894
895
896
897
898
899
900
901

902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
...
928
929
930
931
932
933
934





935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
...
956
957
958
959
960
961
962
963
964
965
966



967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
...
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010

1011
1012
1013
1014
1015
1016

1017
1018
1019
1020
1021
1022
1023
....
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
....
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
....
1342
1343
1344
1345
1346
1347
1348

1349
1350
1351
1352
1353

1354
1355







1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388





1389
1390
1391



1392
1393
1394
1395
1396
1397
1398
....
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
....
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
....
2027
2028
2029
2030
2031
2032
2033
2034
2035
2036
2037
2038
2039
2040
2041
....
2048
2049
2050
2051
2052
2053
2054
2055
2056
2057
2058
2059
2060
2061
2062
....
2070
2071
2072
2073
2074
2075
2076
2077
2078
2079
2080
2081
2082
2083
2084
....
2127
2128
2129
2130
2131
2132
2133

2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151

2152



2153
2154












2155



2156
2157




2158
2159
2160
2161
2162


2163
2164
2165
2166
2167
2168
2169
....
2343
2344
2345
2346
2347
2348
2349
2350
2351
2352
2353
2354
2355
2356
2357
....
2359
2360
2361
2362
2363
2364
2365
2366
2367
2368
2369
2370
2371
2372
2373
....
2419
2420
2421
2422
2423
2424
2425

2426
2427
2428
2429
2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
....
2452
2453
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
2469
2470
2471
2472
2473
2474
2475
2476

2477
2478
2479


2480
2481
2482
2483
2484
2485
2486
....
2572
2573
2574
2575
2576
2577
2578
2579







2580
2581
2582
2583
2584
2585
2586
2587
#include <sys/stat.h>
#include <fcntl.h>

/*
** File-system object. Each database connection allocates a single instance
** of the following structure. It is used for all access to the database and
** log files.
**
** pLruFirst, pLruLast:
**   The first and last entries in a doubly-linked list of pages. The 
**   Page.pLruNext and Page.pLruPrev pointers are used to link the list
**   elements together.





**
**   In mmap() mode, this list contains all currently allocated pages that
**   are carrying pointers into the database file mapping (pMap/nMap). If the
**   file has to be unmapped and then remapped (required to grow the mapping
**   as the file grows), the Page.aData pointers are updated by iterating
**   through the contents of this list.




**
**   In non-mmap() mode, this list is an LRU list of cached pages with 





























**   nRef==0.





*/
struct FileSystem {
  lsm_db *pDb;                    /* Database handle that owns this object */
  lsm_env *pEnv;                  /* Environment pointer */
  char *zDb;                      /* Database file name */
  char *zLog;                     /* Database file name */
  int nMetasize;                  /* Size of meta pages in bytes */
................................................................................
  int szSector;                   /* Database file sector size */

  /* If this is a compressed database, a pointer to the compression methods.
  ** For an uncompressed database, a NULL pointer.  */
  lsm_compress *pCompress;
  u8 *aIBuffer;                   /* Buffer to compress to */
  u8 *aOBuffer;                   /* Buffer to uncompress from */
  int nBuffer;                    /* Allocated size of aBuffer[] in bytes */

  /* mmap() mode things */
  int bUseMmap;                   /* True to use mmap() to access db file */
  void *pMap;                     /* Current mapping of database file */
  i64 nMap;                       /* Bytes mapped at pMap */
  Page *pFree;

  Page *pWaiting;                 /* b-tree pages waiting to be written */

  /* Statistics */
  int nWrite;                     /* Total number of pages written */
  int nRead;                      /* Total number of pages read */


  /* Page cache parameters for non-mmap() mode */
  int nOut;                       /* Number of outstanding pages */
  int nCacheMax;                  /* Configured cache size (in pages) */
  int nCacheAlloc;                /* Current cache size (in pages) */
  Page *pLruFirst;                /* Head of the LRU list */
  Page *pLruLast;                 /* Tail of the LRU list */
  int nHash;                      /* Number of hash slots in hash table */
  Page **apHash;                  /* nHash Hash slots */








};

/*
** Database page handle.
**
** pSeg:
**   When lsmFsSortedAppend() is called on a compressed database, the new
................................................................................
  FileSystem *pFS;                /* File system that owns this page */

  /* Only used in compressed database mode: */
  int nCompress;                  /* Compressed size (or 0 for uncomp. db) */
  int nCompressPrev;              /* Compressed size of prev page */
  Segment *pSeg;                  /* Segment this page will be written to */

  /* Fix this up somehow */
  Page *pNextWaiting;


};

/*
** Meta-data page handle. There are two meta-data pages at the start of
** the database file, each FileSystem.nMetasize bytes in size.
*/
struct MetaPage {
................................................................................
static int IOERR_WRAPPER(int rc){
  if( rc!=LSM_OK ) lsmIoerrBkpt();
  return rc;
}
#else
# define IOERR_WRAPPER(rc) (rc)
#endif

























/*
** Wrappers around the VFS methods of the lsm_env object:
**
**     lsmEnvOpen()
**     lsmEnvRead()
**     lsmEnvWrite()
................................................................................
  zDel = lsmMallocPrintf(pFS->pEnv, "%s-log", pFS->zDb);
  if( zDel ){
    lsmEnvUnlink(pFS->pEnv, zDel);
    lsmFree(pFS->pEnv, zDel);
  }
  return LSM_OK;
}









/*
** Given that there are currently nHash slots in the hash table, return 
** the hash key for file iFile, page iPg.
*/
static int fsHashKey(int nHash, int iPg){
  return (iPg % nHash);
................................................................................
  FileSystem *pFS = db->pFS;
  if( pFS ){
    lsm_env *pEnv = pFS->pEnv;
    Page *pPg;

    assert( pFS->nOut==0 );
    assert( pFS->pWaiting==0 );


    /* Reset any compression/decompression buffers already allocated */
    lsmFree(pEnv, pFS->aIBuffer);
    lsmFree(pEnv, pFS->aOBuffer);
    pFS->nBuffer = 0;

    /* Unmap the file, if it is currently mapped */
    if( pFS->pMap ){
      lsmEnvRemap(pEnv, pFS->fdDb, -1, &pFS->pMap, &pFS->nMap);
      pFS->bUseMmap = 0;
    }

    /* Free all allocate page structures */
    pPg = pFS->pLruFirst;
    while( pPg ){
      Page *pNext = pPg->pLruNext;

      if( pPg->flags & PAGE_FREE ) lsmFree(pEnv, pPg->aData);







      lsmFree(pEnv, pPg);
      pPg = pNext;
    }

    /* Zero pointers that point to deleted page objects */
    pFS->nCacheAlloc = 0;
    pFS->pLruFirst = 0;
    pFS->pLruLast = 0;
    pFS->pFree = 0;




    /* Configure the FileSystem object */
    if( db->compress.xCompress ){
      pFS->pCompress = &db->compress;
      pFS->bUseMmap = 0;
    }else{
      pFS->pCompress = 0;
      pFS->bUseMmap = db->bMmap;





    }
  }

  return LSM_OK;
}

/*
................................................................................
static void fsPageRemoveFromHash(FileSystem *pFS, Page *pPg){
  int iHash;
  Page **pp;

  iHash = fsHashKey(pFS->nHash, pPg->iPg);
  for(pp=&pFS->apHash[iHash]; *pp!=pPg; pp=&(*pp)->pHashNext);
  *pp = pPg->pHashNext;

}


/*
** Purge the page cache of all entries with nRef==0.
*/
void lsmFsPurgeCache(FileSystem *pFS){
  if( pFS->bUseMmap==0 ){
    Page *pPg;

    pPg = pFS->pLruFirst;
    while( pPg ){
      Page *pNext = pPg->pLruNext;

      fsPageRemoveFromHash(pFS, pPg);
      if( pPg->flags & PAGE_FREE ){
        lsmFree(pFS->pEnv, pPg->aData);
      }
      lsmFree(pFS->pEnv, pPg);
      pPg = pNext;
      pFS->nCacheAlloc--;
    }
    pFS->pLruFirst = 0;
    pFS->pLruLast = 0;

    assert( pFS->nCacheAlloc<=pFS->nOut && pFS->nCacheAlloc>=0 );
  }
}

/*
** Search the hash-table for page iPg. If an entry is round, return a pointer
** to it. Otherwise, return NULL.
**
** Either way, if argument piHash is not NULL set *piHash to the hash slot
................................................................................
  if( piHash ) *piHash = iHash;
  for(p=pFS->apHash[iHash]; p; p=p->pHashNext){
    if( p->iPg==iPg) break;
  }
  return p;
}






static int fsPageBuffer(
  FileSystem *pFS, 
  Page **ppOut
){
  int rc = LSM_OK;
  Page *pPage = 0;
  if( pFS->bUseMmap || pFS->pLruFirst==0 || pFS->nCacheAlloc<pFS->nCacheMax ){
    pPage = lsmMallocZero(pFS->pEnv, sizeof(Page));
    if( !pPage ){
      rc = LSM_NOMEM_BKPT;
    }else{
      pPage->aData = (u8 *)lsmMalloc(pFS->pEnv, pFS->nPagesize);
      pPage->flags = PAGE_FREE;
      if( !pPage->aData ){
        lsmFree(pFS->pEnv, pPage);
        rc = LSM_NOMEM_BKPT;
        pPage = 0;
      }
      pFS->nCacheAlloc++;
    }
................................................................................
    u8 *aData;
    pPage = pFS->pLruFirst;
    aData = pPage->aData;
    fsPageRemoveFromLru(pFS, pPage);
    fsPageRemoveFromHash(pFS, pPage);

    memset(pPage, 0, sizeof(Page));
    pPage->flags = PAGE_FREE;
    pPage->aData = aData;
  }




  *ppOut = pPage;
  return rc;
}

static void fsPageBufferFree(Page *pPg){
  if( pPg->flags & PAGE_FREE ){
    lsmFree(pPg->pFS->pEnv, pPg->aData);
  }
  else if( pPg->pFS->bUseMmap ){
    fsPageRemoveFromLru(pPg->pFS, pPg);
  }
  lsmFree(pPg->pFS->pEnv, pPg);
}

static void fsGrowMapping(
  FileSystem *pFS,
  i64 iSz,
  int *pRc
................................................................................
  if( *pRc==LSM_OK && iSz>pFS->nMap ){
    int rc;
    u8 *aOld = pFS->pMap;
    rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap);
    if( rc==LSM_OK && pFS->pMap!=aOld ){
      Page *pFix;
      i64 iOff = (u8 *)pFS->pMap - aOld;
      for(pFix=pFS->pLruFirst; pFix; pFix=pFix->pLruNext){
        pFix->aData += iOff;
      }
      lsmSortedRemap(pFS->pDb);
    }
    *pRc = rc;
  }
}


/*
** fsync() the database file.
*/
int lsmFsSyncDb(FileSystem *pFS, int nBlock){

  if( nBlock && pFS->bUseMmap ){
    int rc = LSM_OK;
    i64 nMin = (i64)nBlock * (i64)pFS->nBlocksize;
    fsGrowMapping(pFS, nMin, &rc);
    if( rc!=LSM_OK ) return rc;
  }

  return lsmEnvSync(pFS->pEnv, pFS->fdDb);
}

static int fsPageGet(FileSystem *, Segment *, Pgno, int, Page **, int *);

static int fsRedirectBlock(Redirect *p, int iBlk){
  if( p ){
................................................................................
  
  if( pSeg ){
    iRead = fsRedirectBlock(pSeg->pRedirect, iBlock);
  }else{
    iRead = iBlock;
  }

  assert( pFS->bUseMmap==0 || pFS->pCompress==0 );
  if( pFS->pCompress ){
    i64 iOff;                     /* File offset to read data from */
    u8 aNext[4];                  /* 4-byte pointer read from db file */

    iOff = (i64)iRead * pFS->nBlocksize - sizeof(aNext);
    rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aNext, sizeof(aNext));
    if( rc==LSM_OK ){
................................................................................
  FileSystem *pFS,                /* File-system object handle */
  Segment *pSeg,                  /* Use this segment for block redirects */
  int iBlock,                     /* Read field from this block */
  int *piPrev                     /* OUT: Previous block in linked list */
){
  int rc = LSM_OK;                /* Return code */

  assert( pFS->bUseMmap==0 || pFS->pCompress==0 );
  assert( iBlock>0 );

  if( pFS->pCompress ){
    i64 iOff = fsFirstPageOnBlock(pFS, iBlock) - 4;
    u8 aPrev[4];                  /* 4-byte pointer read from db file */
    rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aPrev, sizeof(aPrev));
    if( rc==LSM_OK ){
................................................................................
  int rc = LSM_OK;

  /* In most cases iReal is the same as iPg. Except, if pSeg->pRedirect is 
  ** not NULL, and the block containing iPg has been redirected, then iReal
  ** is the page number after redirection.  */
  Pgno iReal = lsmFsRedirectPage(pFS, (pSeg ? pSeg->pRedirect : 0), iPg);


  assert( iPg>=fsFirstPageOnBlock(pFS, 1) );
  assert( iReal>=fsFirstPageOnBlock(pFS, 1) );
  *ppPg = 0;

  assert( pFS->bUseMmap==0 || pFS->pCompress==0 );

  if( pFS->bUseMmap ){
    Page *pTest;







    i64 iEnd = (i64)iReal * pFS->nPagesize;
    fsGrowMapping(pFS, iEnd, &rc);
    if( rc!=LSM_OK ) return rc;

    p = 0;
    for(pTest=pFS->pWaiting; pTest; pTest=pTest->pNextWaiting){
      if( pTest->iPg==iReal ){
        assert( iReal==iPg );
        p = pTest;
        p->nRef++;
        *ppPg = p;
        return LSM_OK;
      }
    }
    if( pFS->pFree ){
      p = pFS->pFree;
      pFS->pFree = p->pHashNext;
      assert( p->nRef==0 );
    }else{
      p = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc);
      if( rc ) return rc;
      fsPageAddToLru(pFS, p);
      p->pFS = pFS;
    }
    p->aData = &((u8 *)pFS->pMap)[pFS->nPagesize * (iReal-1)];
    p->iPg = iReal;
    assert( (p->flags & PAGE_FREE)==0 );
  }else{

    /* Search the hash-table for the page */
    iHash = fsHashKey(pFS->nHash, iReal);
    for(p=pFS->apHash[iHash]; p; p=p->pHashNext){
      if( p->iPg==iReal) break;





    }

    if( p==0 ){



      rc = fsPageBuffer(pFS, &p);
      if( rc==LSM_OK ){
        int nSpace = 0;
        p->iPg = iReal;
        p->nRef = 0;
        p->pFS = pFS;
        assert( p->flags==0 || p->flags==PAGE_FREE );
................................................................................
            i64 iOff = (i64)(iReal-1) * pFS->nPagesize;
            rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, p->aData, nByte);
          }
          pFS->nRead++;
        }

        /* If the xRead() call was successful (or not attempted), link the
         ** page into the page-cache hash-table. Otherwise, if it failed,
         ** free the buffer. */
        if( rc==LSM_OK && nSpace==0 ){
          p->pHashNext = pFS->apHash[iHash];
          pFS->apHash[iHash] = p;
        }else{
          fsPageBufferFree(p);
          p = 0;
          if( pnSpace ) *pnSpace = nSpace;
        }
      }
    }else if( p->nRef==0 ){
      fsPageRemoveFromLru(pFS, p);
    }

    assert( (rc==LSM_OK && (p || (pnSpace && *pnSpace)))
         || (rc!=LSM_OK && p==0) 
    );
  }

................................................................................
** may be garbage. It is the callers responsibility to deal with this.
*/
int lsmFsReadSyncedId(lsm_db *db, int iMeta, i64 *piVal){
  FileSystem *pFS = db->pFS;
  int rc = LSM_OK;

  assert( iMeta==1 || iMeta==2 );
  if( pFS->bUseMmap ){
    fsGrowMapping(pFS, iMeta*LSM_META_PAGE_SIZE, &rc);
    if( rc==LSM_OK ){
      *piVal = (i64)lsmGetU64(&((u8 *)pFS->pMap)[(iMeta-1)*LSM_META_PAGE_SIZE]);
    }
  }else{
    MetaPage *pMeta = 0;
    rc = lsmFsMetaPageGet(pFS, 0, iMeta, &pMeta);
................................................................................
  MetaPage *pPg;
  assert( iPg==1 || iPg==2 );

  pPg = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc);

  if( pPg ){
    i64 iOff = (iPg-1) * pFS->nMetasize;
    if( pFS->bUseMmap ){
      fsGrowMapping(pFS, 2*pFS->nMetasize, &rc);
      pPg->aData = (u8 *)(pFS->pMap) + iOff;
    }else{
      pPg->aData = lsmMallocRc(pFS->pEnv, pFS->nMetasize, &rc);
      if( rc==LSM_OK && bWrite==0 ){
        rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, pPg->aData, pFS->nMetasize);
      }
................................................................................
      else if( rc==LSM_OK ){
        memset( pPg->aData, 0x77, pFS->nMetasize );
      }
#endif
    }

    if( rc!=LSM_OK ){
      if( pFS->bUseMmap==0 ) lsmFree(pFS->pEnv, pPg->aData);
      lsmFree(pFS->pEnv, pPg);
      pPg = 0;
    }else{
      pPg->iPg = iPg;
      pPg->bWrite = bWrite;
      pPg->pFS = pFS;
    }
................................................................................
** Release a meta-page reference obtained via a call to lsmFsMetaPageGet().
*/
int lsmFsMetaPageRelease(MetaPage *pPg){
  int rc = LSM_OK;
  if( pPg ){
    FileSystem *pFS = pPg->pFS;

    if( pFS->bUseMmap==0 ){
      if( pPg->bWrite ){
        i64 iOff = (pPg->iPg==2 ? pFS->nMetasize : 0);
        int nWrite = pFS->nMetasize;
        rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, pPg->aData, nWrite);
      }
      lsmFree(pFS->pEnv, pPg->aData);
    }
................................................................................
** It is safe to assume that there are no outstanding references to pages 
** on block iTo. And that block iFrom is not currently being written. In
** other words, the data can be read and written directly.
*/
int lsmFsMoveBlock(FileSystem *pFS, Segment *pSeg, int iTo, int iFrom){
  Snapshot *p = pFS->pDb->pWorker;
  int rc = LSM_OK;


  i64 iFromOff = (i64)(iFrom-1) * pFS->nBlocksize;
  i64 iToOff = (i64)(iTo-1) * pFS->nBlocksize;
  
  assert( iTo!=1 );
  assert( iFrom>iTo );

  if( pFS->bUseMmap ){
    fsGrowMapping(pFS, (i64)iFrom * pFS->nBlocksize, &rc);
    if( rc==LSM_OK ){
      u8 *aMap = (u8 *)(pFS->pMap);
      memcpy(&aMap[iToOff], &aMap[iFromOff], pFS->nBlocksize);
    }
  }else{
    int nSz = pFS->nPagesize;
    u8 *aData = (u8 *)lsmMallocRc(pFS->pEnv, nSz, &rc);
    if( rc==LSM_OK ){
      const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);

      int i;



      for(i=0; rc==LSM_OK && i<nPagePerBlock; i++){
        i64 iOff = iFromOff + i*nSz;












        rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);



        if( rc==LSM_OK ){
          iOff = iToOff + i*nSz;




          rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
        }
      }
      lsmFsPurgeCache(pFS);
    }


  }

  /* Update append-point list if necessary */
  if( rc==LSM_OK ){
    int i;
    for(i=0; i<LSM_APPLIST_SZ; i++){
      if( fsPageToBlock(pFS, p->aiAppend[i])==iFrom ){
................................................................................
  int rc = *pRc;
  Page *pPg;

  pPg = pFS->pWaiting;
  pFS->pWaiting = 0;

  while( pPg ){
    Page *pNext = pPg->pNextWaiting;
    if( rc==LSM_OK ) rc = lsmFsPagePersist(pPg);
    assert( pPg->nRef==1 );
    lsmFsPageRelease(pPg);
    pPg = pNext;
  }
  *pRc = rc;
}
................................................................................
static void fsRemoveHashEntry(FileSystem *pFS, Pgno iPg){
  Page *p;
  int iHash = fsHashKey(pFS->nHash, iPg);

  for(p=pFS->apHash[iHash]; p && p->iPg!=iPg; p=p->pHashNext);

  if( p ){
    assert( p->nRef==0 );
    fsPageRemoveFromHash(pFS, p);
    p->iPg = 0;
    iHash = fsHashKey(pFS->nHash, 0);
    p->pHashNext = pFS->apHash[iHash];
    pFS->apHash[iHash] = p;
  }
}
................................................................................
        ** lsmFsPagePersist() to write an out-of-order page. Instead a page 
        ** number is assigned here so that the page data will be appended
        ** to the current segment.
        */
        Page **pp;
        int iPrev = 0;
        int iNext = 0;


        assert( pPg->pSeg->iFirst );
        assert( pPg->flags & PAGE_FREE );
        assert( (pPg->flags & PAGE_HASPREV)==0 );
        assert( pPg->nData==pFS->nPagesize-4 );

        rc = fsAppendPage(pFS, pPg->pSeg, &pPg->iPg, &iPrev, &iNext);
        if( rc!=LSM_OK ) return rc;

        if( pFS->bUseMmap==0 ){
          int iHash = fsHashKey(pFS->nHash, pPg->iPg);
          fsRemoveHashEntry(pFS, pPg->iPg);
          pPg->pHashNext = pFS->apHash[iHash];
          pFS->apHash[iHash] = pPg;
          assert( pPg->pHashNext==0 || pPg->pHashNext->iPg!=pPg->iPg );
        }

        if( iPrev ){
          assert( iNext==0 );
          memmove(&pPg->aData[4], pPg->aData, pPg->nData);
          lsmPutU32(pPg->aData, iPrev);
          pPg->flags |= PAGE_HASPREV;
          pPg->aData += 4;
................................................................................
        }else{
          int nData = pPg->nData;
          pPg->nData += 4;
          lsmSortedExpandBtreePage(pPg, nData);
        }

        pPg->nRef++;
        for(pp=&pFS->pWaiting; *pp; pp=&(*pp)->pNextWaiting);
        *pp = pPg;
        assert( pPg->pNextWaiting==0 );

      }else{
        i64 iOff;                   /* Offset to write within database file */

        iOff = (i64)pFS->nPagesize * (i64)(pPg->iPg-1);
        if( pFS->bUseMmap==0 ){
          u8 *aData = pPg->aData - (pPg->flags & PAGE_HASPREV);
          rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, pFS->nPagesize);
        }else if( pPg->flags & PAGE_FREE ){
          fsGrowMapping(pFS, iOff + pFS->nPagesize, &rc);
          if( rc==LSM_OK ){
            u8 *aTo = &((u8 *)(pFS->pMap))[iOff];
            u8 *aFrom = pPg->aData - (pPg->flags & PAGE_HASPREV);
            memcpy(aTo, aFrom, pFS->nPagesize);
            lsmFree(pFS->pEnv, aFrom);

            pPg->aData = aTo + (pPg->flags & PAGE_HASPREV);
            pPg->flags &= ~PAGE_FREE;
            fsPageAddToLru(pFS, pPg);


          }
        }

        lsmFsFlushWaiting(pFS, &rc);
        pPg->flags &= ~PAGE_DIRTY;
        pFS->nWrite++;
      }
................................................................................
      assert( pPg->pFS->pCompress 
           || fsIsFirst(pPg->pFS, pPg->iPg)==0 
           || (pPg->flags & PAGE_HASPREV)
      );
      pPg->aData -= (pPg->flags & PAGE_HASPREV);
      pPg->flags &= ~PAGE_HASPREV;

      if( pFS->bUseMmap ){







        pPg->pHashNext = pFS->pFree;
        pFS->pFree = pPg;
      }else{
#if 0
        assert( pPg->pLruNext==0 );
        assert( pPg->pLruPrev==0 );
        fsPageRemoveFromHash(pFS, pPg);
        fsPageBufferFree(pPg);








|
|
|
|
>
>
>
>
>

<
<
<
<
<
>
>
>
>

<
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
>
>
>
>
>







 







|

|
|


<
<
<
<
<
<
|
>

|
<






>
>
>
>
>
>
>
>







 







|
|
>
>







 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
>
>
>
>
>
>







 







>









|


|



>
|
>
>
>
>
>
>
>









>
>
>




|


|
>
>
>
>
>







 







>




|


<
|

|
|
|
>
|
<
|
<
|
|
|
|
|
|

|
<







 







>
>
>
>
>






|





<







 







<



>
>
>





|
|
|
|
|
|







 







|













>






>







 







|







 







|







 







>




<
>
|
|
>
>
>
>
>
>
>
|
|
|

<
<
<
<
<
<
<
<
<
<
|
|
|
|
|
|
|
<
|
|
|
|
<
<

<
<
<
<
>
>
>
>
>
|
<
<
>
>
>







 







|
|









<
<







 







|







 







|







 







|







 







|







 







>







|
|
|
<
<
|
<
<
<
|
|
>
|
>
>
>
|
|
>
>
>
>
>
>
>
>
>
>
>
>

>
>
>
|
|
>
>
>
>



<

>
>







 







|







 







|







 







>









|
|
|
|
|
|
<







 







|

|





|









>


|
>
>







 







|
>
>
>
>
>
>
>
|







153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170





171
172
173
174
175

176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
...
225
226
227
228
229
230
231
232
233
234
235
236
237






238
239
240
241

242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
...
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
...
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
...
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
...
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
...
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986

987
988
989
990
991
992
993

994

995
996
997
998
999
1000
1001
1002

1003
1004
1005
1006
1007
1008
1009
....
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039

1040
1041
1042
1043
1044
1045
1046
....
1048
1049
1050
1051
1052
1053
1054

1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
....
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
....
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
....
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
....
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449

1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463










1464
1465
1466
1467
1468
1469
1470

1471
1472
1473
1474


1475




1476
1477
1478
1479
1480
1481


1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
....
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519


1520
1521
1522
1523
1524
1525
1526
....
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
....
2118
2119
2120
2121
2122
2123
2124
2125
2126
2127
2128
2129
2130
2131
2132
....
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
2153
....
2161
2162
2163
2164
2165
2166
2167
2168
2169
2170
2171
2172
2173
2174
2175
....
2218
2219
2220
2221
2222
2223
2224
2225
2226
2227
2228
2229
2230
2231
2232
2233
2234
2235


2236



2237
2238
2239
2240
2241
2242
2243
2244
2245
2246
2247
2248
2249
2250
2251
2252
2253
2254
2255
2256
2257
2258
2259
2260
2261
2262
2263
2264
2265
2266
2267
2268
2269
2270

2271
2272
2273
2274
2275
2276
2277
2278
2279
2280
....
2454
2455
2456
2457
2458
2459
2460
2461
2462
2463
2464
2465
2466
2467
2468
....
2470
2471
2472
2473
2474
2475
2476
2477
2478
2479
2480
2481
2482
2483
2484
....
2530
2531
2532
2533
2534
2535
2536
2537
2538
2539
2540
2541
2542
2543
2544
2545
2546
2547
2548
2549
2550
2551
2552

2553
2554
2555
2556
2557
2558
2559
....
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
....
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
#include <sys/stat.h>
#include <fcntl.h>

/*
** File-system object. Each database connection allocates a single instance
** of the following structure. It is used for all access to the database and
** log files.
**
** The database file may be accessed via two methods - using mmap() or using
** read() and write() calls. In the general case both methods are used - a
** prefix of the file is mapped into memory and the remainder accessed using
** read() and write(). This is helpful when accessing very large files (or
** files that may grow very large during the lifetime of a database
** connection) on systems with 32-bit address spaces. However, it also requires
** that this object manage two distinct types of Page objects simultaneously -
** those that carry pointers to the mapped file and those that carry arrays
** populated by read() calls.
**





** pFree:
**   The head of a singly-linked list that containing currently unused Page 
**   structures suitable for use as mmap-page handles. Connected by the
**   Page.pFreeNext pointers.
**

** pMapped:
**   The head of a singly-linked list that contains all pages that currently
**   carry pointers to the mapped region. This is used if the region is
**   every remapped - the pointers carried by existing pages can be adjusted
**   to account for the remapping. Connected by the Page.pMappedNext pointers.
**
** pWaiting:
**   When the upper layer wishes to append a new b-tree page to a segment,
**   it allocates a Page object that carries a malloc'd block of memory -
**   regardless of the mmap-related configuration. The page is not assigned
**   a page number at first. When the upper layer has finished constructing
**   the page contents, it calls lsmFsPagePersist() to assign a page number
**   to it. At this point it is likely that N pages have been written to the
**   segment, the (N+1)th page is still outstanding and the b-tree page is
**   assigned page number (N+2). To avoid writing page (N+2) before page 
**   (N+1), the recently completed b-tree page is held in the singly linked
**   list headed by pWaiting until page (N+1) has been written. 
**
**   Function lsmFsFlushWaiting() is responsible for eventually writing 
**   waiting pages to disk.
**
**
** apHash/nHash:
**   Hash table used to store all Page objects that carry malloc'd arrays,
**   except those b-tree pages that have not yet been assigned page numbers.
**   Once they have been assigned page numbers - they are added to this
**   hash table.
**
**   Hash table overflow chains are connected using the Page.pHashNext
**   pointers.
**
** pLruFirst, pLruLast:
**   The first and last entries in a doubly-linked list of pages. This
**   list contains all pages with malloc'd data that are present in the
**   hash table and have a ref-count of zero.
*/
struct FileSystem {
  lsm_db *pDb;                    /* Database handle that owns this object */
  lsm_env *pEnv;                  /* Environment pointer */
  char *zDb;                      /* Database file name */
  char *zLog;                     /* Database file name */
  int nMetasize;                  /* Size of meta pages in bytes */
................................................................................
  int szSector;                   /* Database file sector size */

  /* If this is a compressed database, a pointer to the compression methods.
  ** For an uncompressed database, a NULL pointer.  */
  lsm_compress *pCompress;
  u8 *aIBuffer;                   /* Buffer to compress to */
  u8 *aOBuffer;                   /* Buffer to uncompress from */
  int nBuffer;                    /* Allocated size of above buffers in bytes */

  /* mmap() page related things */
  i64 nMapLimit;                  /* Maximum bytes of file to map */
  void *pMap;                     /* Current mapping of database file */
  i64 nMap;                       /* Bytes mapped at pMap */






  Page *pFree;                    /* Unused Page structures */
  Page *pMapped;                  /* List of Page structs that point to pMap */

  /* Page cache parameters for non-mmap() pages */

  int nCacheMax;                  /* Configured cache size (in pages) */
  int nCacheAlloc;                /* Current cache size (in pages) */
  Page *pLruFirst;                /* Head of the LRU list */
  Page *pLruLast;                 /* Tail of the LRU list */
  int nHash;                      /* Number of hash slots in hash table */
  Page **apHash;                  /* nHash Hash slots */

  /* Both types of pages */
  Page *pWaiting;                 /* b-tree pages waiting to be written */

  /* Statistics */
  int nOut;                       /* Number of outstanding pages */
  int nWrite;                     /* Total number of pages written */
  int nRead;                      /* Total number of pages read */
};

/*
** Database page handle.
**
** pSeg:
**   When lsmFsSortedAppend() is called on a compressed database, the new
................................................................................
  FileSystem *pFS;                /* File system that owns this page */

  /* Only used in compressed database mode: */
  int nCompress;                  /* Compressed size (or 0 for uncomp. db) */
  int nCompressPrev;              /* Compressed size of prev page */
  Segment *pSeg;                  /* Segment this page will be written to */

  /* Pointers for singly linked lists */
  Page *pWaitingNext;             /* Next page in FileSystem.pWaiting list */
  Page *pFreeNext;                /* Next page in FileSystem.pFree list */
  Page *pMappedNext;              /* Next page in FileSystem.pMapped list */
};

/*
** Meta-data page handle. There are two meta-data pages at the start of
** the database file, each FileSystem.nMetasize bytes in size.
*/
struct MetaPage {
................................................................................
static int IOERR_WRAPPER(int rc){
  if( rc!=LSM_OK ) lsmIoerrBkpt();
  return rc;
}
#else
# define IOERR_WRAPPER(rc) (rc)
#endif

#ifdef NDEBUG
# define assert_lists_are_ok(x)
#else
static Page *fsPageFindInHash(FileSystem *pFS, Pgno iPg, int *piHash);

static void assert_lists_are_ok(FileSystem *pFS){
  Page *p;

  assert( pFS->nMapLimit>=0 );

  /* Check that all pages in the LRU list have nRef==0, pointers to buffers
  ** in heap memory, and corresponding entries in the hash table.  */
  for(p=pFS->pLruFirst; p; p=p->pLruNext){
    assert( p==pFS->pLruFirst || p->pLruPrev!=0 );
    assert( p==pFS->pLruLast || p->pLruNext!=0 );
    assert( p->pLruPrev==0 || p->pLruPrev->pLruNext==p );
    assert( p->pLruNext==0 || p->pLruNext->pLruPrev==p );
    assert( p->nRef==0 );
    assert( p->flags & PAGE_FREE );
    assert( p==fsPageFindInHash(pFS, p->iPg, 0) );
  }
}
#endif

/*
** Wrappers around the VFS methods of the lsm_env object:
**
**     lsmEnvOpen()
**     lsmEnvRead()
**     lsmEnvWrite()
................................................................................
  zDel = lsmMallocPrintf(pFS->pEnv, "%s-log", pFS->zDb);
  if( zDel ){
    lsmEnvUnlink(pFS->pEnv, zDel);
    lsmFree(pFS->pEnv, zDel);
  }
  return LSM_OK;
}

/*
** Return true if page iReal of the database should be accessed using mmap.
** False otherwise.
*/
static int fsMmapPage(FileSystem *pFS, Pgno iReal){
  return ((i64)iReal*pFS->nPagesize <= pFS->nMapLimit);
}

/*
** Given that there are currently nHash slots in the hash table, return 
** the hash key for file iFile, page iPg.
*/
static int fsHashKey(int nHash, int iPg){
  return (iPg % nHash);
................................................................................
  FileSystem *pFS = db->pFS;
  if( pFS ){
    lsm_env *pEnv = pFS->pEnv;
    Page *pPg;

    assert( pFS->nOut==0 );
    assert( pFS->pWaiting==0 );
    assert( pFS->pMapped==0 );

    /* Reset any compression/decompression buffers already allocated */
    lsmFree(pEnv, pFS->aIBuffer);
    lsmFree(pEnv, pFS->aOBuffer);
    pFS->nBuffer = 0;

    /* Unmap the file, if it is currently mapped */
    if( pFS->pMap ){
      lsmEnvRemap(pEnv, pFS->fdDb, -1, &pFS->pMap, &pFS->nMap);
      pFS->nMapLimit = 0;
    }

    /* Free all allocated page structures */
    pPg = pFS->pLruFirst;
    while( pPg ){
      Page *pNext = pPg->pLruNext;
      assert( pPg->flags & PAGE_FREE );
      lsmFree(pEnv, pPg->aData);
      lsmFree(pEnv, pPg);
      pPg = pNext;
    }

    pPg = pFS->pFree;
    while( pPg ){
      Page *pNext = pPg->pFreeNext;
      lsmFree(pEnv, pPg);
      pPg = pNext;
    }

    /* Zero pointers that point to deleted page objects */
    pFS->nCacheAlloc = 0;
    pFS->pLruFirst = 0;
    pFS->pLruLast = 0;
    pFS->pFree = 0;
    if( pFS->apHash ){
      memset(pFS->apHash, 0, pFS->nHash*sizeof(pFS->apHash[0]));
    }

    /* Configure the FileSystem object */
    if( db->compress.xCompress ){
      pFS->pCompress = &db->compress;
      pFS->nMapLimit = 0;
    }else{
      pFS->pCompress = 0;
      if( db->iMmap==1 ){
        /* Unlimited */
        pFS->nMapLimit = (i64)1 << 60;
      }else{
        pFS->nMapLimit = (i64)db->iMmap * 1024;
      }
    }
  }

  return LSM_OK;
}

/*
................................................................................
static void fsPageRemoveFromHash(FileSystem *pFS, Page *pPg){
  int iHash;
  Page **pp;

  iHash = fsHashKey(pFS->nHash, pPg->iPg);
  for(pp=&pFS->apHash[iHash]; *pp!=pPg; pp=&(*pp)->pHashNext);
  *pp = pPg->pHashNext;
  pPg->pHashNext = 0;
}


/*
** Purge the cache of all non-mmap pages with nRef==0.
*/
void lsmFsPurgeCache(FileSystem *pFS){

  Page *pPg;

  pPg = pFS->pLruFirst;
  while( pPg ){
    Page *pNext = pPg->pLruNext;
    assert( pPg->flags & PAGE_FREE );
    fsPageRemoveFromHash(pFS, pPg);

    lsmFree(pFS->pEnv, pPg->aData);

    lsmFree(pFS->pEnv, pPg);
    pPg = pNext;
    pFS->nCacheAlloc--;
  }
  pFS->pLruFirst = 0;
  pFS->pLruLast = 0;

  assert( pFS->nCacheAlloc<=pFS->nOut && pFS->nCacheAlloc>=0 );

}

/*
** Search the hash-table for page iPg. If an entry is round, return a pointer
** to it. Otherwise, return NULL.
**
** Either way, if argument piHash is not NULL set *piHash to the hash slot
................................................................................
  if( piHash ) *piHash = iHash;
  for(p=pFS->apHash[iHash]; p; p=p->pHashNext){
    if( p->iPg==iPg) break;
  }
  return p;
}

/*
** Allocate and return a non-mmap Page object. If there are already 
** nCacheMax such Page objects outstanding, try to recycle an existing 
** Page instead.
*/
static int fsPageBuffer(
  FileSystem *pFS, 
  Page **ppOut
){
  int rc = LSM_OK;
  Page *pPage = 0;
  if( pFS->pLruFirst==0 || pFS->nCacheAlloc<pFS->nCacheMax ){
    pPage = lsmMallocZero(pFS->pEnv, sizeof(Page));
    if( !pPage ){
      rc = LSM_NOMEM_BKPT;
    }else{
      pPage->aData = (u8 *)lsmMalloc(pFS->pEnv, pFS->nPagesize);

      if( !pPage->aData ){
        lsmFree(pFS->pEnv, pPage);
        rc = LSM_NOMEM_BKPT;
        pPage = 0;
      }
      pFS->nCacheAlloc++;
    }
................................................................................
    u8 *aData;
    pPage = pFS->pLruFirst;
    aData = pPage->aData;
    fsPageRemoveFromLru(pFS, pPage);
    fsPageRemoveFromHash(pFS, pPage);

    memset(pPage, 0, sizeof(Page));

    pPage->aData = aData;
  }

  if( pPage ){
    pPage->flags = PAGE_FREE;
  }
  *ppOut = pPage;
  return rc;
}

static void fsPageBufferFree(Page *pPg){
  assert( pPg->pLruNext==0 );
  assert( pPg->pLruPrev==0 );
  assert( pPg->pFS->pLruFirst!=pPg );
  assert( pPg->pFS->pLruLast!=pPg );

  lsmFree(pPg->pFS->pEnv, pPg->aData);
  lsmFree(pPg->pFS->pEnv, pPg);
}

static void fsGrowMapping(
  FileSystem *pFS,
  i64 iSz,
  int *pRc
................................................................................
  if( *pRc==LSM_OK && iSz>pFS->nMap ){
    int rc;
    u8 *aOld = pFS->pMap;
    rc = lsmEnvRemap(pFS->pEnv, pFS->fdDb, iSz, &pFS->pMap, &pFS->nMap);
    if( rc==LSM_OK && pFS->pMap!=aOld ){
      Page *pFix;
      i64 iOff = (u8 *)pFS->pMap - aOld;
      for(pFix=pFS->pMapped; pFix; pFix=pFix->pMappedNext){
        pFix->aData += iOff;
      }
      lsmSortedRemap(pFS->pDb);
    }
    *pRc = rc;
  }
}


/*
** fsync() the database file.
*/
int lsmFsSyncDb(FileSystem *pFS, int nBlock){
#if 0
  if( nBlock && pFS->bUseMmap ){
    int rc = LSM_OK;
    i64 nMin = (i64)nBlock * (i64)pFS->nBlocksize;
    fsGrowMapping(pFS, nMin, &rc);
    if( rc!=LSM_OK ) return rc;
  }
#endif
  return lsmEnvSync(pFS->pEnv, pFS->fdDb);
}

static int fsPageGet(FileSystem *, Segment *, Pgno, int, Page **, int *);

static int fsRedirectBlock(Redirect *p, int iBlk){
  if( p ){
................................................................................
  
  if( pSeg ){
    iRead = fsRedirectBlock(pSeg->pRedirect, iBlock);
  }else{
    iRead = iBlock;
  }

  assert( pFS->nMapLimit==0 || pFS->pCompress==0 );
  if( pFS->pCompress ){
    i64 iOff;                     /* File offset to read data from */
    u8 aNext[4];                  /* 4-byte pointer read from db file */

    iOff = (i64)iRead * pFS->nBlocksize - sizeof(aNext);
    rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aNext, sizeof(aNext));
    if( rc==LSM_OK ){
................................................................................
  FileSystem *pFS,                /* File-system object handle */
  Segment *pSeg,                  /* Use this segment for block redirects */
  int iBlock,                     /* Read field from this block */
  int *piPrev                     /* OUT: Previous block in linked list */
){
  int rc = LSM_OK;                /* Return code */

  assert( pFS->nMapLimit==0 || pFS->pCompress==0 );
  assert( iBlock>0 );

  if( pFS->pCompress ){
    i64 iOff = fsFirstPageOnBlock(pFS, iBlock) - 4;
    u8 aPrev[4];                  /* 4-byte pointer read from db file */
    rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aPrev, sizeof(aPrev));
    if( rc==LSM_OK ){
................................................................................
  int rc = LSM_OK;

  /* In most cases iReal is the same as iPg. Except, if pSeg->pRedirect is 
  ** not NULL, and the block containing iPg has been redirected, then iReal
  ** is the page number after redirection.  */
  Pgno iReal = lsmFsRedirectPage(pFS, (pSeg ? pSeg->pRedirect : 0), iPg);

  assert_lists_are_ok(pFS);
  assert( iPg>=fsFirstPageOnBlock(pFS, 1) );
  assert( iReal>=fsFirstPageOnBlock(pFS, 1) );
  *ppPg = 0;



  /* Search the hash-table for the page */
  p = fsPageFindInHash(pFS, iReal, &iHash);

  if( p ){
    assert( p->flags & PAGE_FREE );
    if( p->nRef==0 ) fsPageRemoveFromLru(pFS, p);
  }else{

    if( fsMmapPage(pFS, iReal) ){
      i64 iEnd = (i64)iReal * pFS->nPagesize;
      fsGrowMapping(pFS, iEnd, &rc);
      if( rc!=LSM_OK ) return rc;











      if( pFS->pFree ){
        p = pFS->pFree;
        pFS->pFree = p->pFreeNext;
        assert( p->nRef==0 );
      }else{
        p = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc);
        if( rc ) return rc;

        p->pFS = pFS;
      }
      p->aData = &((u8 *)pFS->pMap)[pFS->nPagesize * (iReal-1)];
      p->iPg = iReal;







      /* This page now carries a pointer to the mapping. Link it in to
      ** the FileSystem.pMapped list.  */
      assert( p->pMappedNext==0 );
      p->pMappedNext = pFS->pMapped;
      pFS->pMapped = p;



      assert( pFS->pCompress==0 );
      assert( (p->flags & PAGE_FREE)==0 );
    }else{
      rc = fsPageBuffer(pFS, &p);
      if( rc==LSM_OK ){
        int nSpace = 0;
        p->iPg = iReal;
        p->nRef = 0;
        p->pFS = pFS;
        assert( p->flags==0 || p->flags==PAGE_FREE );
................................................................................
            i64 iOff = (i64)(iReal-1) * pFS->nPagesize;
            rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, p->aData, nByte);
          }
          pFS->nRead++;
        }

        /* If the xRead() call was successful (or not attempted), link the
        ** page into the page-cache hash-table. Otherwise, if it failed,
        ** free the buffer. */
        if( rc==LSM_OK && nSpace==0 ){
          p->pHashNext = pFS->apHash[iHash];
          pFS->apHash[iHash] = p;
        }else{
          fsPageBufferFree(p);
          p = 0;
          if( pnSpace ) *pnSpace = nSpace;
        }
      }


    }

    assert( (rc==LSM_OK && (p || (pnSpace && *pnSpace)))
         || (rc!=LSM_OK && p==0) 
    );
  }

................................................................................
** may be garbage. It is the callers responsibility to deal with this.
*/
int lsmFsReadSyncedId(lsm_db *db, int iMeta, i64 *piVal){
  FileSystem *pFS = db->pFS;
  int rc = LSM_OK;

  assert( iMeta==1 || iMeta==2 );
  if( pFS->nMapLimit>0 ){
    fsGrowMapping(pFS, iMeta*LSM_META_PAGE_SIZE, &rc);
    if( rc==LSM_OK ){
      *piVal = (i64)lsmGetU64(&((u8 *)pFS->pMap)[(iMeta-1)*LSM_META_PAGE_SIZE]);
    }
  }else{
    MetaPage *pMeta = 0;
    rc = lsmFsMetaPageGet(pFS, 0, iMeta, &pMeta);
................................................................................
  MetaPage *pPg;
  assert( iPg==1 || iPg==2 );

  pPg = lsmMallocZeroRc(pFS->pEnv, sizeof(Page), &rc);

  if( pPg ){
    i64 iOff = (iPg-1) * pFS->nMetasize;
    if( pFS->nMapLimit>0 ){
      fsGrowMapping(pFS, 2*pFS->nMetasize, &rc);
      pPg->aData = (u8 *)(pFS->pMap) + iOff;
    }else{
      pPg->aData = lsmMallocRc(pFS->pEnv, pFS->nMetasize, &rc);
      if( rc==LSM_OK && bWrite==0 ){
        rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, pPg->aData, pFS->nMetasize);
      }
................................................................................
      else if( rc==LSM_OK ){
        memset( pPg->aData, 0x77, pFS->nMetasize );
      }
#endif
    }

    if( rc!=LSM_OK ){
      if( pFS->nMapLimit==0 ) lsmFree(pFS->pEnv, pPg->aData);
      lsmFree(pFS->pEnv, pPg);
      pPg = 0;
    }else{
      pPg->iPg = iPg;
      pPg->bWrite = bWrite;
      pPg->pFS = pFS;
    }
................................................................................
** Release a meta-page reference obtained via a call to lsmFsMetaPageGet().
*/
int lsmFsMetaPageRelease(MetaPage *pPg){
  int rc = LSM_OK;
  if( pPg ){
    FileSystem *pFS = pPg->pFS;

    if( pFS->nMapLimit==0 ){
      if( pPg->bWrite ){
        i64 iOff = (pPg->iPg==2 ? pFS->nMetasize : 0);
        int nWrite = pFS->nMetasize;
        rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, pPg->aData, nWrite);
      }
      lsmFree(pFS->pEnv, pPg->aData);
    }
................................................................................
** It is safe to assume that there are no outstanding references to pages 
** on block iTo. And that block iFrom is not currently being written. In
** other words, the data can be read and written directly.
*/
int lsmFsMoveBlock(FileSystem *pFS, Segment *pSeg, int iTo, int iFrom){
  Snapshot *p = pFS->pDb->pWorker;
  int rc = LSM_OK;
  i64 nMap;

  i64 iFromOff = (i64)(iFrom-1) * pFS->nBlocksize;
  i64 iToOff = (i64)(iTo-1) * pFS->nBlocksize;
  
  assert( iTo!=1 );
  assert( iFrom>iTo );

  /* Grow the mapping as required. */
  nMap = LSM_MIN(pFS->nMapLimit, (i64)iFrom * pFS->nBlocksize);
  fsGrowMapping(pFS, nMap, &rc);






  if( rc==LSM_OK ){
    const int nPagePerBlock = (pFS->nBlocksize / pFS->nPagesize);
    int nSz = pFS->nPagesize;
    int i;
    u8 *aBuf = 0;
    u8 *aData = 0;

    for(i=0; rc==LSM_OK && i<nPagePerBlock; i++){
      i64 iOff = iFromOff + i*nSz;
      Page *pPg;

      /* Set aData to point to a buffer containing the from page */
      if( (iOff+nSz)<=pFS->nMapLimit ){
        u8 *aMap = (u8 *)(pFS->pMap);
        aData = &aMap[iOff];
      }else{
        if( aBuf==0 ){
          aBuf = (u8 *)lsmMallocRc(pFS->pEnv, nSz, &rc);
          if( aBuf==0 ) break;
        }
        aData = aBuf;
        rc = lsmEnvRead(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
      }

      /* Copy aData to the to page */
      if( rc==LSM_OK ){
        iOff = iToOff + i*nSz;
        if( (iOff+nSz)<=pFS->nMapLimit ){
          u8 *aMap = (u8 *)(pFS->pMap);
          memcpy(&aMap[iOff], aData, nSz);
        }else{
          rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, nSz);
        }
      }

    }
    lsmFree(pFS->pEnv, aBuf);
    lsmFsPurgeCache(pFS);
  }

  /* Update append-point list if necessary */
  if( rc==LSM_OK ){
    int i;
    for(i=0; i<LSM_APPLIST_SZ; i++){
      if( fsPageToBlock(pFS, p->aiAppend[i])==iFrom ){
................................................................................
  int rc = *pRc;
  Page *pPg;

  pPg = pFS->pWaiting;
  pFS->pWaiting = 0;

  while( pPg ){
    Page *pNext = pPg->pWaitingNext;
    if( rc==LSM_OK ) rc = lsmFsPagePersist(pPg);
    assert( pPg->nRef==1 );
    lsmFsPageRelease(pPg);
    pPg = pNext;
  }
  *pRc = rc;
}
................................................................................
static void fsRemoveHashEntry(FileSystem *pFS, Pgno iPg){
  Page *p;
  int iHash = fsHashKey(pFS->nHash, iPg);

  for(p=pFS->apHash[iHash]; p && p->iPg!=iPg; p=p->pHashNext);

  if( p ){
    assert( p->nRef==0 || (p->flags & PAGE_FREE)==0 );
    fsPageRemoveFromHash(pFS, p);
    p->iPg = 0;
    iHash = fsHashKey(pFS->nHash, 0);
    p->pHashNext = pFS->apHash[iHash];
    pFS->apHash[iHash] = p;
  }
}
................................................................................
        ** lsmFsPagePersist() to write an out-of-order page. Instead a page 
        ** number is assigned here so that the page data will be appended
        ** to the current segment.
        */
        Page **pp;
        int iPrev = 0;
        int iNext = 0;
        int iHash;

        assert( pPg->pSeg->iFirst );
        assert( pPg->flags & PAGE_FREE );
        assert( (pPg->flags & PAGE_HASPREV)==0 );
        assert( pPg->nData==pFS->nPagesize-4 );

        rc = fsAppendPage(pFS, pPg->pSeg, &pPg->iPg, &iPrev, &iNext);
        if( rc!=LSM_OK ) return rc;

        assert( pPg->flags & PAGE_FREE );
        iHash = fsHashKey(pFS->nHash, pPg->iPg);
        fsRemoveHashEntry(pFS, pPg->iPg);
        pPg->pHashNext = pFS->apHash[iHash];
        pFS->apHash[iHash] = pPg;
        assert( pPg->pHashNext==0 || pPg->pHashNext->iPg!=pPg->iPg );


        if( iPrev ){
          assert( iNext==0 );
          memmove(&pPg->aData[4], pPg->aData, pPg->nData);
          lsmPutU32(pPg->aData, iPrev);
          pPg->flags |= PAGE_HASPREV;
          pPg->aData += 4;
................................................................................
        }else{
          int nData = pPg->nData;
          pPg->nData += 4;
          lsmSortedExpandBtreePage(pPg, nData);
        }

        pPg->nRef++;
        for(pp=&pFS->pWaiting; *pp; pp=&(*pp)->pWaitingNext);
        *pp = pPg;
        assert( pPg->pWaitingNext==0 );

      }else{
        i64 iOff;                   /* Offset to write within database file */

        iOff = (i64)pFS->nPagesize * (i64)(pPg->iPg-1);
        if( fsMmapPage(pFS, pPg->iPg)==0 ){
          u8 *aData = pPg->aData - (pPg->flags & PAGE_HASPREV);
          rc = lsmEnvWrite(pFS->pEnv, pFS->fdDb, iOff, aData, pFS->nPagesize);
        }else if( pPg->flags & PAGE_FREE ){
          fsGrowMapping(pFS, iOff + pFS->nPagesize, &rc);
          if( rc==LSM_OK ){
            u8 *aTo = &((u8 *)(pFS->pMap))[iOff];
            u8 *aFrom = pPg->aData - (pPg->flags & PAGE_HASPREV);
            memcpy(aTo, aFrom, pFS->nPagesize);
            lsmFree(pFS->pEnv, aFrom);
            pFS->nCacheAlloc--;
            pPg->aData = aTo + (pPg->flags & PAGE_HASPREV);
            pPg->flags &= ~PAGE_FREE;
            fsPageRemoveFromHash(pFS, pPg);
            pPg->pMappedNext = pFS->pMapped;
            pFS->pMapped = pPg;
          }
        }

        lsmFsFlushWaiting(pFS, &rc);
        pPg->flags &= ~PAGE_DIRTY;
        pFS->nWrite++;
      }
................................................................................
      assert( pPg->pFS->pCompress 
           || fsIsFirst(pPg->pFS, pPg->iPg)==0 
           || (pPg->flags & PAGE_HASPREV)
      );
      pPg->aData -= (pPg->flags & PAGE_HASPREV);
      pPg->flags &= ~PAGE_HASPREV;

      if( (pPg->flags & PAGE_FREE)==0 ){
        /* Removed from mapped list */
        Page **pp;
        for(pp=&pFS->pMapped; (*pp)!=pPg; pp=&(*pp)->pMappedNext);
        *pp = pPg->pMappedNext;
        pPg->pMappedNext = 0;

        /* Add to free list */
        pPg->pFreeNext = pFS->pFree;
        pFS->pFree = pPg;
      }else{
#if 0
        assert( pPg->pLruNext==0 );
        assert( pPg->pLruPrev==0 );
        fsPageRemoveFromHash(pFS, pPg);
        fsPageBufferFree(pPg);

Changes to src/lsm_main.c

92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
...
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
  pDb->nDfltBlksz = LSM_DFLT_BLOCK_SIZE;
  pDb->nMerge = LSM_DFLT_AUTOMERGE;
  pDb->nMaxFreelist = LSM_MAX_FREELIST_ENTRIES;
  pDb->bUseLog = LSM_DFLT_USE_LOG;
  pDb->iReader = -1;
  pDb->iRwclient = -1;
  pDb->bMultiProc = LSM_DFLT_MULTIPLE_PROCESSES;
  pDb->bMmap = LSM_DFLT_MMAP;
  pDb->xLog = xLog;
  pDb->compress.iId = LSM_COMPRESSION_NONE;
  return LSM_OK;
}

lsm_env *lsm_get_env(lsm_db *pDb){
  assert( pDb->pEnv );
................................................................................
      }
      *piVal = pDb->eSafety;
      break;
    }

    case LSM_CONFIG_MMAP: {
      int *piVal = va_arg(ap, int *);
      if( pDb->iReader<0 && *piVal>=0 && *piVal<=1 ){
        pDb->bMmap = *piVal;
        rc = lsmFsConfigure(pDb);
      }
      *piVal = pDb->bMmap;
      break;
    }

    case LSM_CONFIG_USE_LOG: {
      int *piVal = va_arg(ap, int *);
      if( pDb->nTransOpen==0 && (*piVal==0 || *piVal==1) ){
        pDb->bUseLog = *piVal;







|







 







|
|


|







92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
...
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
  pDb->nDfltBlksz = LSM_DFLT_BLOCK_SIZE;
  pDb->nMerge = LSM_DFLT_AUTOMERGE;
  pDb->nMaxFreelist = LSM_MAX_FREELIST_ENTRIES;
  pDb->bUseLog = LSM_DFLT_USE_LOG;
  pDb->iReader = -1;
  pDb->iRwclient = -1;
  pDb->bMultiProc = LSM_DFLT_MULTIPLE_PROCESSES;
  pDb->iMmap = LSM_DFLT_MMAP;
  pDb->xLog = xLog;
  pDb->compress.iId = LSM_COMPRESSION_NONE;
  return LSM_OK;
}

lsm_env *lsm_get_env(lsm_db *pDb){
  assert( pDb->pEnv );
................................................................................
      }
      *piVal = pDb->eSafety;
      break;
    }

    case LSM_CONFIG_MMAP: {
      int *piVal = va_arg(ap, int *);
      if( pDb->iReader<0 && *piVal>=0 ){
        pDb->iMmap = *piVal;
        rc = lsmFsConfigure(pDb);
      }
      *piVal = pDb->iMmap;
      break;
    }

    case LSM_CONFIG_USE_LOG: {
      int *piVal = va_arg(ap, int *);
      if( pDb->nTransOpen==0 && (*piVal==0 || *piVal==1) ){
        pDb->bUseLog = *piVal;