Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Have the low-level b-tree insert routine return BT_BLOCKFULL if a level-0 tree is full.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 65642c32ba4415e535a639c71e99c95d8e5120c3
User & Date: dan 2013-11-26 20:35:48.418
Context
2013-11-27
14:40
Allocate blocks of space (not individual pages) within the database file for sub-trees. check-in: 5986afca58 user: dan tags: trunk
2013-11-26
20:35
Have the low-level b-tree insert routine return BT_BLOCKFULL if a level-0 tree is full. check-in: 65642c32ba user: dan tags: trunk
2013-11-25
20:50
Begin adding code for blind-writes. check-in: fc9cdc6ca3 user: dan tags: trunk
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/btInt.h.
18
19
20
21
22
23
24







25
26
27
28
29
30
31

typedef sqlite4_int64 i64;
typedef sqlite4_uint64 u64;
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;









/* Number of elements in an array object. */
#define array_size(x) (sizeof(x)/sizeof(x[0]))

/* Number of read-lock slots in shared memory */
#define BT_NREADER 4








>
>
>
>
>
>
>







18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38

typedef sqlite4_int64 i64;
typedef sqlite4_uint64 u64;
typedef unsigned int u32;
typedef unsigned short u16;
typedef unsigned char u8;

/* 
** Special error codes used internally. These must be distinct from SQLite4
** error codes. Which is easy - SQLite4 error codes are all greater than or
** equal to zero.
*/
#define BT_BLOCKFULL -1


/* Number of elements in an array object. */
#define array_size(x) (sizeof(x)/sizeof(x[0]))

/* Number of read-lock slots in shared memory */
#define BT_NREADER 4

41
42
43
44
45
46
47

48
49
50
51
52
53
54

/* By default blocks are 512K bytes in size. */
#define BT_DEFAULT_BLKSZ (512*1024)

typedef struct BtDbHdr BtDbHdr;
struct BtDbHdr {
  u32 pgsz;                       /* Page size in bytes */

  u32 nPg;                        /* Size of database file in pages */

  u32 iRoot;                      /* B-tree root page */
  u32 iMRoot;                     /* Root page of meta-tree */
  u32 iSRoot;                     /* Root page of schedule-tree */

  u32 iSubRoot;                   /* Root of current sub-tree */







>







48
49
50
51
52
53
54
55
56
57
58
59
60
61
62

/* By default blocks are 512K bytes in size. */
#define BT_DEFAULT_BLKSZ (512*1024)

typedef struct BtDbHdr BtDbHdr;
struct BtDbHdr {
  u32 pgsz;                       /* Page size in bytes */
  u32 blksz;                      /* Block size in bytes */
  u32 nPg;                        /* Size of database file in pages */

  u32 iRoot;                      /* B-tree root page */
  u32 iMRoot;                     /* Root page of meta-tree */
  u32 iSRoot;                     /* Root page of schedule-tree */

  u32 iSubRoot;                   /* Root of current sub-tree */
Changes to src/bt_log.c.
863
864
865
866
867
868
869

870
871
872
873
874
875
876
    memcpy(&hdr, aData, sizeof(BtDbHdrCksum));
    btLogChecksum32(1, (u8*)&hdr, offsetof(BtDbHdrCksum, aCksum), 0, aCksum);
  }

  if( aData==0 || aCksum[0]!=hdr.aCksum[0] || aCksum[1]!=hdr.aCksum[1] ){
    memset(&hdr, 0, sizeof(BtDbHdrCksum));
    hdr.hdr.pgsz = BT_DEFAULT_PGSZ;

    hdr.hdr.nPg = 2;
    hdr.hdr.iRoot = 2;
  }

  memcpy(pHdr, &hdr, sizeof(BtDbHdr));
}








>







863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
    memcpy(&hdr, aData, sizeof(BtDbHdrCksum));
    btLogChecksum32(1, (u8*)&hdr, offsetof(BtDbHdrCksum, aCksum), 0, aCksum);
  }

  if( aData==0 || aCksum[0]!=hdr.aCksum[0] || aCksum[1]!=hdr.aCksum[1] ){
    memset(&hdr, 0, sizeof(BtDbHdrCksum));
    hdr.hdr.pgsz = BT_DEFAULT_PGSZ;
    hdr.hdr.blksz = BT_DEFAULT_BLKSZ;
    hdr.hdr.nPg = 2;
    hdr.hdr.iRoot = 2;
  }

  memcpy(pHdr, &hdr, sizeof(BtDbHdr));
}

Changes to src/bt_main.c.
2430
2431
2432
2433
2434
2435
2436








2437
2438
2439
2440
2441
2442
2443
2444
      btPutU16(&aData[pgsz-6], iWrite);
    }

  }else{
    /* The new entry will not fit on the leaf page. Entries will have
    ** to be shuffled between existing leaves and new leaves may need
    ** to be added to make space for it. */








    if( pCsr->nPg==1 ){
      rc = btExtendTree(pCsr);
    }
    if( rc==SQLITE4_OK ){
      rc = btBalance(pCsr, bLeaf, nKV, apKV);
    }
  }








>
>
>
>
>
>
>
>
|







2430
2431
2432
2433
2434
2435
2436
2437
2438
2439
2440
2441
2442
2443
2444
2445
2446
2447
2448
2449
2450
2451
2452
      btPutU16(&aData[pgsz-6], iWrite);
    }

  }else{
    /* The new entry will not fit on the leaf page. Entries will have
    ** to be shuffled between existing leaves and new leaves may need
    ** to be added to make space for it. */
    bt_db *db = pCsr->base.pDb;
    if( db->bFastInsertOp ){
      BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
      if( pHdr->nSubPg >= (pHdr->blksz / pHdr->pgsz) ){
        rc = BT_BLOCKFULL;
        pHdr->iSubRoot = 0;
      }
    }
    if( rc==SQLITE4_OK && pCsr->nPg==1 ){
      rc = btExtendTree(pCsr);
    }
    if( rc==SQLITE4_OK ){
      rc = btBalance(pCsr, bLeaf, nKV, apKV);
    }
  }

2535
2536
2537
2538
2539
2540
2541


2542
2543
2544
2545
2546
2547

2548
2549





2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577


2578
2579













2580
2581
2582
2583

2584
2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603




















2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614


2615
2616

2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660

2661


2662
2663
2664




2665


2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
      }
    }
  }

  return rc;
}



static int btReplace(
  bt_db *db,                      /* Database handle */
  u32 iRoot,                      /* Root page of b-tree to update */
  const void *pK, int nK,         /* Key to insert */
  const void *pV, int nV          /* Value to insert. (nV<0) -> delete */
){

  int rc;                         /* Return code */
  BtCursor csr;                  /* Cursor object to seek to insert point */






  /* Seek stack cursor csr to the b-tree page that key pK/nK is/would be
  ** stored on.  */
  btCsrSetup(db, iRoot, &csr);
  rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);

  if( rc==SQLITE4_OK ){
    /* The cursor currently points to an entry with key pK/nK. This call
    ** should therefore replace that entry. So delete it and then re-seek
    ** the cursor.  */
    rc = sqlite4BtDelete(&csr.base);

    if( rc==SQLITE4_OK && nV>=0 ){
      rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
      if( rc==SQLITE4_OK ) rc = btErrorBkpt(SQLITE4_CORRUPT);
    }
  }

  if( nV>=0 && (rc==SQLITE4_NOTFOUND || rc==SQLITE4_INEXACT) ){
    /* Insert the new KV pair into the current leaf. */
    KeyValue kv;
    kv.pgno = 0;
    kv.eType = KV_VALUE;
    kv.pK = pK; kv.nK = nK;
    kv.pV = pV; kv.nV = nV;

    rc = btOverflowAssign(db, &kv);
    if( rc==SQLITE4_OK ){


      rc = btInsertAndBalance(&csr, 1, &kv);
    }













    if( kv.eType==KV_CELL ){
      sqlite4_free(db->pEnv, (void*)kv.pV);
    }
  }

  btCsrReset(&csr, 1);

  return rc;
}

static int btAllocateNewRoot(bt_db *db, u32 *piNew){
  u32 iNew = 0;
  BtPage *pPg;
  int rc;

  rc = btAllocateNonOverflow(db, &pPg);
  if( rc==SQLITE4_OK ){
    iNew = sqlite4BtPagePgno(pPg);
    sqlite4BtPageRelease(pPg);
  }

  *piNew = iNew;
  return rc;
}





















static int btFastInsertMaxLevel(
  bt_db *db, 
  BtDbHdr *pHdr, 
  u32 *piLevel
){
  int rc;
  BtCursor csr;

  btCsrSetup(db, pHdr->iMRoot, &csr);
  rc = btCsrEnd(&csr, 1);
  assert( rc!=SQLITE4_INEXACT );


  if( rc==SQLITE4_OK ){
    u8 *aK; int nK;

    rc = sqlite4BtCsrKey(&csr.base, (const void**)&aK, &nK);
    if( rc==SQLITE4_OK ){
      *piLevel = btGetU32(&aK[0]);
    }
  }else if( rc==SQLITE4_NOTFOUND ){
    rc = SQLITE4_OK;
    *piLevel = 0;
  }
  btCsrReset(&csr, 1);

  return rc;
}

static int btFastInsertRoot(
  bt_db *db, 
  BtDbHdr *pHdr, 
  u32 *piRoot
){
  int rc = SQLITE4_OK;
  u32 iSubRoot = 0;

  if( pHdr->iMRoot==0 ){
    rc = btAllocateNewRoot(db, &pHdr->iMRoot);
  }
  iSubRoot = pHdr->iSubRoot;

  /* If the current writable sub-tree is full, start a new one. */
  if( pHdr->nSubPg >= (BT_DEFAULT_BLKSZ / pHdr->pgsz) ){
    iSubRoot = 0;
  }

  /* If no writable sub-tree has been discovered, create one now. */
  if( iSubRoot==0 ){
    u32 iMaxLevel = 0;

    u8 aKey[4];
    u8 aVal[8];

    if( rc==SQLITE4_OK ){
      rc = btFastInsertMaxLevel(db, pHdr, &iMaxLevel);
    }
    if( rc==SQLITE4_OK ){
      rc = btAllocateNewRoot(db, &iSubRoot);
    }

    if( rc==SQLITE4_OK ){


      pHdr->iSubRoot = iSubRoot;
      pHdr->nSubPg = 0;





      btPutU32(aKey, iMaxLevel+1);


      btPutU32(&aVal[0], iSubRoot);
      btPutU32(&aVal[4], 1);
      rc = btReplace(db, pHdr->iMRoot, aKey, 4, aVal, 8);
    }
  }

  *piRoot = iSubRoot;
  return rc;
}








>
>
|





>


>
>
>
>
>



|














|
<








>
>
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>




>




















>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|


|





|

>
>

|
>
|
|
|



<



















<
<
<
<
<


|

<
<
<
<
|
<



>

>
>



>
>
>
>
|
>
>


|







2543
2544
2545
2546
2547
2548
2549
2550
2551
2552
2553
2554
2555
2556
2557
2558
2559
2560
2561
2562
2563
2564
2565
2566
2567
2568
2569
2570
2571
2572
2573
2574
2575
2576
2577
2578
2579
2580
2581
2582
2583
2584

2585
2586
2587
2588
2589
2590
2591
2592
2593
2594
2595
2596
2597
2598
2599
2600
2601
2602
2603
2604
2605
2606
2607
2608
2609
2610
2611
2612
2613
2614
2615
2616
2617
2618
2619
2620
2621
2622
2623
2624
2625
2626
2627
2628
2629
2630
2631
2632
2633
2634
2635
2636
2637
2638
2639
2640
2641
2642
2643
2644
2645
2646
2647
2648
2649
2650
2651
2652
2653
2654
2655
2656
2657
2658
2659
2660
2661
2662
2663
2664
2665
2666
2667
2668
2669
2670
2671
2672
2673
2674
2675
2676

2677
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
2693
2694
2695





2696
2697
2698
2699




2700

2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
2712
2713
2714
2715
2716
2717
2718
2719
2720
2721
2722
2723
2724
2725
2726
2727
      }
    }
  }

  return rc;
}

static int btFastInsertRoot(bt_db *db, BtDbHdr *pHdr, u32 *piRoot);

static int btReplaceEntry(
  bt_db *db,                      /* Database handle */
  u32 iRoot,                      /* Root page of b-tree to update */
  const void *pK, int nK,         /* Key to insert */
  const void *pV, int nV          /* Value to insert. (nV<0) -> delete */
){
  BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
  int rc;                         /* Return code */
  BtCursor csr;                  /* Cursor object to seek to insert point */
  u32 iRootPg = iRoot;

  if( rc==SQLITE4_OK && iRoot==0 ){
    rc = btFastInsertRoot(db, pHdr, &iRootPg);
  }

  /* Seek stack cursor csr to the b-tree page that key pK/nK is/would be
  ** stored on.  */
  btCsrSetup(db, iRootPg, &csr);
  rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);

  if( rc==SQLITE4_OK ){
    /* The cursor currently points to an entry with key pK/nK. This call
    ** should therefore replace that entry. So delete it and then re-seek
    ** the cursor.  */
    rc = sqlite4BtDelete(&csr.base);

    if( rc==SQLITE4_OK && nV>=0 ){
      rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
      if( rc==SQLITE4_OK ) rc = btErrorBkpt(SQLITE4_CORRUPT);
    }
  }

  if( nV>=0 && (rc==SQLITE4_NOTFOUND || rc==SQLITE4_INEXACT ) ){

    KeyValue kv;
    kv.pgno = 0;
    kv.eType = KV_VALUE;
    kv.pK = pK; kv.nK = nK;
    kv.pV = pV; kv.nV = nV;

    rc = btOverflowAssign(db, &kv);
    if( rc==SQLITE4_OK ){
      do{
        /* Insert the new KV pair into the current leaf. */
        rc = btInsertAndBalance(&csr, 1, &kv);

        /* Unless this is a block-full error, break out of the loop */
        if( rc!=BT_BLOCKFULL ) break;
        assert( iRoot==0 );

        rc = btFastInsertRoot(db, pHdr, &iRootPg);
        if( rc==SQLITE4_OK ){
          btCsrReset(&csr, 1);
          btCsrSetup(db, iRootPg, &csr);
          rc = btCsrSeek(&csr, pK, nK, BT_SEEK_GE, BT_CSRSEEK_UPDATE);
        }
      }while( rc==SQLITE4_NOTFOUND || rc==SQLITE4_INEXACT );
    }

    if( kv.eType==KV_CELL ){
      sqlite4_free(db->pEnv, (void*)kv.pV);
    }
  }

  btCsrReset(&csr, 1);

  return rc;
}

static int btAllocateNewRoot(bt_db *db, u32 *piNew){
  u32 iNew = 0;
  BtPage *pPg;
  int rc;

  rc = btAllocateNonOverflow(db, &pPg);
  if( rc==SQLITE4_OK ){
    iNew = sqlite4BtPagePgno(pPg);
    sqlite4BtPageRelease(pPg);
  }

  *piNew = iNew;
  return rc;
}

static int btDecodeMetatreeKey(
  BtCursor *pCsr,
  u32 *piAge,
  u32 *piLevel,
  u8 **paKey,
  int *pnKey
){
  u8 *aK; int nK;
  int rc = sqlite4BtCsrKey((bt_cursor*)pCsr, (const void**)&aK, &nK);
  if( rc==SQLITE4_OK ){
    *piAge = btGetU32(&aK[0]);
    *piLevel = ~btGetU32(&aK[4]);
    if( paKey ){
      *paKey = &aK[8];
      *pnKey = nK-8;
    }
  }
  return rc;
}

static int btFindNextLevel(
  bt_db *db, 
  BtDbHdr *pHdr, 
  u32 *piNext
){
  int rc;
  BtCursor csr;

  btCsrSetup(db, pHdr->iMRoot, &csr);
  rc = btCsrEnd(&csr, 0);
  assert( rc!=SQLITE4_INEXACT );

  *piNext = 0;
  if( rc==SQLITE4_OK ){
    u32 iAge;
    u32 iLevel;
    rc = btDecodeMetatreeKey(&csr, &iAge, &iLevel, 0, 0);
    if( rc==SQLITE4_OK && iAge==0 ){
      *piNext = iLevel+1;
    }
  }else if( rc==SQLITE4_NOTFOUND ){
    rc = SQLITE4_OK;

  }
  btCsrReset(&csr, 1);

  return rc;
}

static int btFastInsertRoot(
  bt_db *db, 
  BtDbHdr *pHdr, 
  u32 *piRoot
){
  int rc = SQLITE4_OK;
  u32 iSubRoot = 0;

  if( pHdr->iMRoot==0 ){
    rc = btAllocateNewRoot(db, &pHdr->iMRoot);
  }
  iSubRoot = pHdr->iSubRoot;






  /* If no writable sub-tree has been discovered, create one now. */
  if( iSubRoot==0 ){
    u32 iLevel;





    rc = btFindNextLevel(db, pHdr, &iLevel);

    if( rc==SQLITE4_OK ){
      rc = btAllocateNewRoot(db, &iSubRoot);
    }

    if( rc==SQLITE4_OK ){
      u8 aKey[8];
      u8 aVal[8];
      pHdr->iSubRoot = iSubRoot;
      pHdr->nSubPg = 0;

      /* The key for the new entry consists of the concatentation of two 
      ** 32-bit big-endian integers - the <age> and <level-no>. The age
      ** of the new segment is 0. The level number is one greater than the
      ** level number of the previous segment.  */
      btPutU32(&aKey[0], 0);
      btPutU32(&aKey[4], ~iLevel);

      btPutU32(&aVal[0], iSubRoot);
      btPutU32(&aVal[4], 1);
      rc = btReplaceEntry(db, pHdr->iMRoot, aKey, 8, aVal, 8);
    }
  }

  *piRoot = iSubRoot;
  return rc;
}

2690
2691
2692
2693
2694
2695
2696
2697
2698
2699
2700
2701
2702
2703
2704
2705
2706
2707
2708
2709
2710
2711
  assert( rc!=SQLITE4_NOTFOUND && rc!=SQLITE4_INEXACT );
  btCheckPageRefs(db);

  /* Seek stack cursor csr to the b-tree page that key pK/nK is/would be
  ** stored on.  */
  if( rc==SQLITE4_OK ){
    BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
    u32 iRoot;
    if( db->bFastInsertOp ){
      rc = btFastInsertRoot(db, pHdr, &iRoot);
    }else{
      iRoot = pHdr->iRoot;
    }
    if( rc==SQLITE4_OK ){
      rc = btReplace(db, iRoot, pK, nK, pV, nV);
    }
  }

  btCheckPageRefs(db);
  db->bFastInsertOp = 0;
  return rc;
}







|
|
<
<



|







2742
2743
2744
2745
2746
2747
2748
2749
2750


2751
2752
2753
2754
2755
2756
2757
2758
2759
2760
2761
  assert( rc!=SQLITE4_NOTFOUND && rc!=SQLITE4_INEXACT );
  btCheckPageRefs(db);

  /* Seek stack cursor csr to the b-tree page that key pK/nK is/would be
  ** stored on.  */
  if( rc==SQLITE4_OK ){
    BtDbHdr *pHdr = sqlite4BtPagerDbhdr(db->pPager);
    u32 iRoot = 0;
    if( !db->bFastInsertOp ){


      iRoot = pHdr->iRoot;
    }
    if( rc==SQLITE4_OK ){
      rc = btReplaceEntry(db, iRoot, pK, nK, pV, nV);
    }
  }

  btCheckPageRefs(db);
  db->bFastInsertOp = 0;
  return rc;
}
Changes to src/bt_pager.c.
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097

1098
1099
1100
1101
1102
1103
1104
1105
** or SQLITE4_OK otherwise.
*/
int sqlite4BtPagerHdrdump(BtPager *pPager, sqlite4_buffer *pBuf){
  BtDbHdr *pHdr = pPager->pHdr;
  int rc = SQLITE4_OK;

  sqlite4BtBufAppendf(pBuf, 
      "pgsz=%d nPg=%d"
      " iRoot=%d iMRoot=%d iSRoot=%d"
      " iCookie=%d iFreePg=%d iFreeBlk=%d",

      pHdr->pgsz, pHdr->nPg, pHdr->iRoot, pHdr->iMRoot, pHdr->iSRoot,
      pHdr->iCookie, pHdr->iFreePg, pHdr->iFreeBlk
  );

  return rc;
}

#ifndef NDEBUG







|


>
|







1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
** or SQLITE4_OK otherwise.
*/
int sqlite4BtPagerHdrdump(BtPager *pPager, sqlite4_buffer *pBuf){
  BtDbHdr *pHdr = pPager->pHdr;
  int rc = SQLITE4_OK;

  sqlite4BtBufAppendf(pBuf, 
      "pgsz=%d blksz=%d nPg=%d"
      " iRoot=%d iMRoot=%d iSRoot=%d"
      " iCookie=%d iFreePg=%d iFreeBlk=%d",
      pHdr->pgsz, pHdr->blksz, pHdr->nPg, 
      pHdr->iRoot, pHdr->iMRoot, pHdr->iSRoot,
      pHdr->iCookie, pHdr->iFreePg, pHdr->iFreeBlk
  );

  return rc;
}

#ifndef NDEBUG
Changes to www/bt.wiki.
630
631
632
633
634
635
636















637
638
639
640
641
642
643
  <li><p>Each segment is assigned to a level. Each level contains a 
  set of non-overlapping segments. All segments in level N are newer
  than those in segment (N+1).

  <li><p>Segments are linked together using a tree-like data structure
  stored on regular database pages (mixed in with b-tree data pages).
</ul>
















<h2 id=in-memory_tree>3.2. In-Memory Tree</h2>

<p>This design uses the same multi-version in-memory tree that LSM does. 
The tree structure may be located in heap or shared memory. If shared memory
is used then a separate file (*-shm2 or some such) is used for the data. The 
tree-header used by LSM to identify the latest tree version is combined with







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
  <li><p>Each segment is assigned to a level. Each level contains a 
  set of non-overlapping segments. All segments in level N are newer
  than those in segment (N+1).

  <li><p>Segments are linked together using a tree-like data structure
  stored on regular database pages (mixed in with b-tree data pages).
</ul>

<h3> Meta-Tree Format</h3>

<p> The meta-tree is a b-tree stored in the same format as the main user
b-tree. It serves as an index for all segments in the database. The keys
in the meta-tree are formated as follows:

<ul>
  <li> Segment age (big-endian 32-bit integer).
  <li> Level number (ones-compliment of value as a big-endian 32-bit integer).
  <li> Separator key. All keys within the segment are guaranteed to be as
       large or larger than the separator key. The separator key may be
       zero bytes in size.
</ul>


<h2 id=in-memory_tree>3.2. In-Memory Tree</h2>

<p>This design uses the same multi-version in-memory tree that LSM does. 
The tree structure may be located in heap or shared memory. If shared memory
is used then a separate file (*-shm2 or some such) is used for the data. The 
tree-header used by LSM to identify the latest tree version is combined with