/ Check-in [0539c2d2]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix alignment problems in btree and pager and allow page sizes that are not a multiple of 8. (CVS 2026)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 0539c2d2b8e16efcbe4db3afeae9c7b426e11b05
User & Date: drh 2004-10-22 16:22:58
Context
2004-10-22
20:29
Add the experimental and scary pragma "writable_schema". (CVS 2027) check-in: 39f7870a user: drh tags: trunk
16:22
Fix alignment problems in btree and pager and allow page sizes that are not a multiple of 8. (CVS 2026) check-in: 0539c2d2 user: drh tags: trunk
2004-10-19
16:40
Reinsert code deleted by (1998) that we thought was unused but was in fact needed. Fix for ticket #966. (CVS 2025) check-in: 370ca539 user: drh tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/btree.c.

5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
...
207
208
209
210
211
212
213





214
215
216
217
218
219
220
...
296
297
298
299
300
301
302

303
304
305
306
307
308
309
...
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
...
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
...
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
...
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
...
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
....
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
....
1074
1075
1076
1077
1078
1079
1080

1081
1082
1083
1084
1085
1086
1087
....
1144
1145
1146
1147
1148
1149
1150

1151
1152
1153
1154
1155
1156
1157
....
1195
1196
1197
1198
1199
1200
1201

1202
1203
1204
1205
1206
1207
1208
....
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
....
2669
2670
2671
2672
2673
2674
2675
2676
2677
2678
2679
2680
2681
2682
2683
....
2956
2957
2958
2959
2960
2961
2962
2963
2964
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
....
3037
3038
3039
3040
3041
3042
3043
3044
3045
3046
3047
3048
3049
3050
3051
3052
3053
3054
....
3084
3085
3086
3087
3088
3089
3090
3091
3092
3093
3094
3095
3096
3097
3098
....
3268
3269
3270
3271
3272
3273
3274
3275
3276
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: btree.c,v 1.192 2004/10/05 02:41:42 drh Exp $
**
** This file implements a external (disk-based) database using BTrees.
** For a detailed discussion of BTrees, refer to
**
**     Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
**     "Sorting And Searching", pages 473-480. Addison-Wesley
**     Publishing Company, Reading, Massachusetts.
................................................................................
*/
#include "sqliteInt.h"
#include "pager.h"
#include "btree.h"
#include "os.h"
#include <assert.h>







/* The following value is the maximum cell size assuming a maximum page
** size give above.
*/
#define MX_CELL_SIZE(pBt)  (pBt->pageSize-8)

/* The maximum number of cells on a single page of the database.  This
................................................................................
  u8 inStmt;            /* True if we are in a statement subtransaction */
  u8 readOnly;          /* True if the underlying file is readonly */
  u8 maxEmbedFrac;      /* Maximum payload as % of total page size */
  u8 minEmbedFrac;      /* Minimum payload as % of total page size */
  u8 minLeafFrac;       /* Minimum leaf payload as % of total page size */
  u8 pageSizeFixed;     /* True if the page size can no longer be changed */
  u16 pageSize;         /* Total number of bytes on a page */

  u16 usableSize;       /* Number of usable bytes on each page */
  int maxLocal;         /* Maximum local payload in non-LEAFDATA tables */
  int minLocal;         /* Minimum local payload in non-LEAFDATA tables */
  int maxLeaf;          /* Maximum local payload in a LEAFDATA table */
  int minLeaf;          /* Minimum local payload in a LEAFDATA table */
};
typedef Btree Bt;
................................................................................
  int cellOffset;
  int nCell, cellLimit;
  u8 *used;

  used = sqliteMallocRaw( pPage->pBt->pageSize );
  if( used==0 ) return;
  usableSize = pPage->pBt->usableSize;
  assert( pPage->aData==&((unsigned char*)pPage)[-pPage->pBt->pageSize] );
  hdr = pPage->hdrOffset;
  assert( hdr==(pPage->pgno==1 ? 100 : 0) );
  assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
  c = pPage->aData[hdr];
  if( pPage->isInit ){
    assert( pPage->leaf == ((c & PTF_LEAF)!=0) );
    assert( pPage->zeroData == ((c & PTF_ZERODATA)!=0) );
................................................................................
  int nFree;         /* Number of unused bytes on the page */
  int top;           /* First byte of the cell content area */

  pBt = pPage->pBt;
  assert( pBt!=0 );
  assert( pParent==0 || pParent->pBt==pBt );
  assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
  assert( pPage->aData == &((unsigned char*)pPage)[-pBt->pageSize] );
  if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){
    /* The parent page should never change unless the file is corrupt */
    return SQLITE_CORRUPT; /* bkpt-CORRUPT */
  }
  if( pPage->isInit ) return SQLITE_OK;
  if( pPage->pParent==0 && pParent!=0 ){
    pPage->pParent = pParent;
................................................................................
static void zeroPage(MemPage *pPage, int flags){
  unsigned char *data = pPage->aData;
  Btree *pBt = pPage->pBt;
  int hdr = pPage->hdrOffset;
  int first;

  assert( sqlite3pager_pagenumber(data)==pPage->pgno );
  assert( &data[pBt->pageSize] == (unsigned char*)pPage );
  assert( sqlite3pager_iswriteable(data) );
  memset(&data[hdr], 0, pBt->usableSize - hdr);
  data[hdr] = flags;
  first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
  memset(&data[hdr+1], 0, 4);
  data[hdr+7] = 0;
  put2byte(&data[hdr+5], pBt->usableSize);
................................................................................
*/
static int getPage(Btree *pBt, Pgno pgno, MemPage **ppPage){
  int rc;
  unsigned char *aData;
  MemPage *pPage;
  rc = sqlite3pager_get(pBt->pPager, pgno, (void**)&aData);
  if( rc ) return rc;
  pPage = (MemPage*)&aData[pBt->pageSize];
  pPage->aData = aData;
  pPage->pBt = pBt;
  pPage->pgno = pgno;
  pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
  *ppPage = pPage;
  return SQLITE_OK;
}
................................................................................
** Release a MemPage.  This should be called once for each prior
** call to getPage.
*/
static void releasePage(MemPage *pPage){
  if( pPage ){
    assert( pPage->aData );
    assert( pPage->pBt );
    assert( &pPage->aData[pPage->pBt->pageSize]==(unsigned char*)pPage );
    sqlite3pager_unref(pPage->aData);
  }
}

/*
** This routine is called when the reference count for a page
** reaches zero.  We need to unref the pParent pointer when that
** happens.
*/
static void pageDestructor(void *pData, int pageSize){
  MemPage *pPage = (MemPage*)&((char*)pData)[pageSize];
  if( pPage->pParent ){
    MemPage *pParent = pPage->pParent;
    pPage->pParent = 0;
    releasePage(pParent);
  }
  pPage->isInit = 0;
}
................................................................................
** so that the cache is restored to its original state at the start of
** the transaction, for each page restored this routine is called.
**
** This routine needs to reset the extra data section at the end of the
** page to agree with the restored data.
*/
static void pageReinit(void *pData, int pageSize){
  MemPage *pPage = (MemPage*)&((char*)pData)[pageSize];
  if( pPage->isInit ){
    pPage->isInit = 0;
    initPage(pPage, pPage->pParent);
  }
}

/*
................................................................................
    nReserve = zDbHeader[20];
    pBt->maxEmbedFrac = zDbHeader[21];
    pBt->minEmbedFrac = zDbHeader[22];
    pBt->minLeafFrac = zDbHeader[23];
    pBt->pageSizeFixed = 1;
  }
  pBt->usableSize = pBt->pageSize - nReserve;

  sqlite3pager_set_pagesize(pBt->pPager, pBt->pageSize);
  *ppBtree = pBt;
  return SQLITE_OK;
}

/*
** Close an open database and invalidate all cursors.
................................................................................
    return SQLITE_READONLY;
  }
  if( nReserve<0 ){
    nReserve = pBt->pageSize - pBt->usableSize;
  }
  if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE ){
    pBt->pageSize = pageSize;

    sqlite3pager_set_pagesize(pBt->pPager, pageSize);
  }
  pBt->usableSize = pBt->pageSize - nReserve;
  return SQLITE_OK;
}

/*
................................................................................
      goto page1_init_failed;
    }
    pBt->pageSize = get2byte(&page1[16]);
    pBt->usableSize = pBt->pageSize - page1[20];
    if( pBt->usableSize<500 ){
      goto page1_init_failed;
    }

    pBt->maxEmbedFrac = page1[21];
    pBt->minEmbedFrac = page1[22];
    pBt->minLeafFrac = page1[23];
  }

  /* maxLocal is the maximum amount of payload to store locally for
  ** a cell.  Make sure it is small enough so that at least minFanout
................................................................................
**
** If there is a transaction in progress, this routine is a no-op.
*/
static void unlockBtreeIfUnused(Btree *pBt){
  if( pBt->inTrans==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
    if( pBt->pPage1->aData==0 ){
      MemPage *pPage = pBt->pPage1;
      pPage->aData = &((char*)pPage)[-pBt->pageSize];
      pPage->pBt = pBt;
      pPage->pgno = 1;
    }
    releasePage(pBt->pPage1);
    pBt->pPage1 = 0;
    pBt->inStmt = 0;
  }
................................................................................
  MemPage *pThis;
  unsigned char *aData;

  if( pgno==0 ) return;
  assert( pBt->pPager!=0 );
  aData = sqlite3pager_lookup(pBt->pPager, pgno);
  if( aData ){
    pThis = (MemPage*)&aData[pBt->pageSize];
    assert( pThis->aData==aData );
    if( pThis->isInit ){
      if( pThis->pParent!=pNewParent ){
        if( pThis->pParent ) sqlite3pager_unref(pThis->pParent->aData);
        pThis->pParent = pNewParent;
        if( pNewParent ) sqlite3pager_ref(pNewParent->aData);
      }
................................................................................
  /*
  ** Allocate space for memory structures
  */
  mxCellPerPage = MX_CELL(pBt);
  apCell = sqliteMallocRaw( 
       (mxCellPerPage+2)*NB*(sizeof(u8*)+sizeof(int))
     + sizeof(MemPage)*NB
     + pBt->pageSize*(5+NB)
  );
  if( apCell==0 ){
    return SQLITE_NOMEM;
  }
  szCell = (int*)&apCell[(mxCellPerPage+2)*NB];
  aCopy[0] = (u8*)&szCell[(mxCellPerPage+2)*NB];
  for(i=1; i<NB; i++){
    aCopy[i] = &aCopy[i-1][pBt->pageSize+sizeof(MemPage)];
  }
  aSpace = &aCopy[NB-1][pBt->pageSize+sizeof(MemPage)];
  
  /*
  ** Find the cell in the parent page whose left child points back
  ** to pPage.  The "idx" variable is the index of that cell.  If pPage
  ** is the rightmost child of pParent then set idx to pParent->nCell 
  */
  if( pParent->idxShift ){
................................................................................
  /*
  ** Make copies of the content of pPage and its siblings into aOld[].
  ** The rest of this function will use data from the copies rather
  ** that the original pages since the original pages will be in the
  ** process of being overwritten.
  */
  for(i=0; i<nOld; i++){
    MemPage *p = apCopy[i] = (MemPage*)&aCopy[i][pBt->pageSize];
    p->aData = &((u8*)p)[-pBt->pageSize];
    memcpy(p->aData, apOld[i]->aData, pBt->pageSize + sizeof(MemPage));
    p->aData = &((u8*)p)[-pBt->pageSize];
  }

  /*
  ** Load pointers to all cells on sibling pages and the divider cells
  ** into the local apCell[] array.  Make copies of the divider cells
  ** into space obtained form aSpace[] and remove the the divider Cells
  ** from pParent.
................................................................................
        */
        dropCell(pParent, nxDiv, sz);
      }else{
        u8 *pTemp;
        szCell[nCell] = sz;
        pTemp = &aSpace[iSpace];
        iSpace += sz;
        assert( iSpace<=pBt->pageSize*5 );
        memcpy(pTemp, apDiv[i], sz);
        apCell[nCell] = pTemp+leafCorrection;
        dropCell(pParent, nxDiv, sz);
        szCell[nCell] -= leafCorrection;
        assert( get4byte(pTemp)==pgnoOld[i] );
        if( !pOld->leaf ){
          assert( leafCorrection==0 );
................................................................................
      }else if( leafData ){
        CellInfo info;
        j--;
        parseCellPtr(pNew, apCell[j], &info);
        pCell = &aSpace[iSpace];
        fillInCell(pParent, pCell, 0, info.nKey, 0, 0, &sz);
        iSpace += sz;
        assert( iSpace<=pBt->pageSize*5 );
        pTemp = 0;
      }else{
        pCell -= 4;
        pTemp = &aSpace[iSpace];
        iSpace += sz;
        assert( iSpace<=pBt->pageSize*5 );
      }
      insertCell(pParent, nxDiv, pCell, sz, pTemp);
      put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
      j++;
      nxDiv++;
    }
  }







|







 







>
>
>
>
>







 







>







 







|







 







|







 







|







 







|







 







|










|







 







|







 







>







 







>







 







>







 







|







 







|







 







|







|

|







 







|
|
|
|







 







|







 







|





|







5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
...
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
...
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
...
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
...
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
...
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
...
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
...
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
....
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
....
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
....
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
....
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
....
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
....
2678
2679
2680
2681
2682
2683
2684
2685
2686
2687
2688
2689
2690
2691
2692
....
2965
2966
2967
2968
2969
2970
2971
2972
2973
2974
2975
2976
2977
2978
2979
2980
2981
2982
2983
2984
2985
2986
2987
2988
2989
....
3046
3047
3048
3049
3050
3051
3052
3053
3054
3055
3056
3057
3058
3059
3060
3061
3062
3063
....
3093
3094
3095
3096
3097
3098
3099
3100
3101
3102
3103
3104
3105
3106
3107
....
3277
3278
3279
3280
3281
3282
3283
3284
3285
3286
3287
3288
3289
3290
3291
3292
3293
3294
3295
3296
3297
** a legal notice, here is a blessing:
**
**    May you do good and not evil.
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
** $Id: btree.c,v 1.193 2004/10/22 16:22:58 drh Exp $
**
** This file implements a external (disk-based) database using BTrees.
** For a detailed discussion of BTrees, refer to
**
**     Donald E. Knuth, THE ART OF COMPUTER PROGRAMMING, Volume 3:
**     "Sorting And Searching", pages 473-480. Addison-Wesley
**     Publishing Company, Reading, Massachusetts.
................................................................................
*/
#include "sqliteInt.h"
#include "pager.h"
#include "btree.h"
#include "os.h"
#include <assert.h>

/*
** This macro rounds values up so that if the value is an address it
** is guaranteed to be an address that is aligned to an 8-byte boundary.
*/
#define FORCE_ALIGNMENT(X)   (((X)+7)&~7)

/* The following value is the maximum cell size assuming a maximum page
** size give above.
*/
#define MX_CELL_SIZE(pBt)  (pBt->pageSize-8)

/* The maximum number of cells on a single page of the database.  This
................................................................................
  u8 inStmt;            /* True if we are in a statement subtransaction */
  u8 readOnly;          /* True if the underlying file is readonly */
  u8 maxEmbedFrac;      /* Maximum payload as % of total page size */
  u8 minEmbedFrac;      /* Minimum payload as % of total page size */
  u8 minLeafFrac;       /* Minimum leaf payload as % of total page size */
  u8 pageSizeFixed;     /* True if the page size can no longer be changed */
  u16 pageSize;         /* Total number of bytes on a page */
  u16 psAligned;        /* pageSize rounded up to a multiple of 8 */
  u16 usableSize;       /* Number of usable bytes on each page */
  int maxLocal;         /* Maximum local payload in non-LEAFDATA tables */
  int minLocal;         /* Minimum local payload in non-LEAFDATA tables */
  int maxLeaf;          /* Maximum local payload in a LEAFDATA table */
  int minLeaf;          /* Minimum local payload in a LEAFDATA table */
};
typedef Btree Bt;
................................................................................
  int cellOffset;
  int nCell, cellLimit;
  u8 *used;

  used = sqliteMallocRaw( pPage->pBt->pageSize );
  if( used==0 ) return;
  usableSize = pPage->pBt->usableSize;
  assert( pPage->aData==&((unsigned char*)pPage)[-pPage->pBt->psAligned] );
  hdr = pPage->hdrOffset;
  assert( hdr==(pPage->pgno==1 ? 100 : 0) );
  assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
  c = pPage->aData[hdr];
  if( pPage->isInit ){
    assert( pPage->leaf == ((c & PTF_LEAF)!=0) );
    assert( pPage->zeroData == ((c & PTF_ZERODATA)!=0) );
................................................................................
  int nFree;         /* Number of unused bytes on the page */
  int top;           /* First byte of the cell content area */

  pBt = pPage->pBt;
  assert( pBt!=0 );
  assert( pParent==0 || pParent->pBt==pBt );
  assert( pPage->pgno==sqlite3pager_pagenumber(pPage->aData) );
  assert( pPage->aData == &((unsigned char*)pPage)[-pBt->psAligned] );
  if( pPage->pParent!=pParent && (pPage->pParent!=0 || pPage->isInit) ){
    /* The parent page should never change unless the file is corrupt */
    return SQLITE_CORRUPT; /* bkpt-CORRUPT */
  }
  if( pPage->isInit ) return SQLITE_OK;
  if( pPage->pParent==0 && pParent!=0 ){
    pPage->pParent = pParent;
................................................................................
static void zeroPage(MemPage *pPage, int flags){
  unsigned char *data = pPage->aData;
  Btree *pBt = pPage->pBt;
  int hdr = pPage->hdrOffset;
  int first;

  assert( sqlite3pager_pagenumber(data)==pPage->pgno );
  assert( &data[pBt->psAligned] == (unsigned char*)pPage );
  assert( sqlite3pager_iswriteable(data) );
  memset(&data[hdr], 0, pBt->usableSize - hdr);
  data[hdr] = flags;
  first = hdr + 8 + 4*((flags&PTF_LEAF)==0);
  memset(&data[hdr+1], 0, 4);
  data[hdr+7] = 0;
  put2byte(&data[hdr+5], pBt->usableSize);
................................................................................
*/
static int getPage(Btree *pBt, Pgno pgno, MemPage **ppPage){
  int rc;
  unsigned char *aData;
  MemPage *pPage;
  rc = sqlite3pager_get(pBt->pPager, pgno, (void**)&aData);
  if( rc ) return rc;
  pPage = (MemPage*)&aData[pBt->psAligned];
  pPage->aData = aData;
  pPage->pBt = pBt;
  pPage->pgno = pgno;
  pPage->hdrOffset = pPage->pgno==1 ? 100 : 0;
  *ppPage = pPage;
  return SQLITE_OK;
}
................................................................................
** Release a MemPage.  This should be called once for each prior
** call to getPage.
*/
static void releasePage(MemPage *pPage){
  if( pPage ){
    assert( pPage->aData );
    assert( pPage->pBt );
    assert( &pPage->aData[pPage->pBt->psAligned]==(unsigned char*)pPage );
    sqlite3pager_unref(pPage->aData);
  }
}

/*
** This routine is called when the reference count for a page
** reaches zero.  We need to unref the pParent pointer when that
** happens.
*/
static void pageDestructor(void *pData, int pageSize){
  MemPage *pPage = (MemPage*)&((char*)pData)[FORCE_ALIGNMENT(pageSize)];
  if( pPage->pParent ){
    MemPage *pParent = pPage->pParent;
    pPage->pParent = 0;
    releasePage(pParent);
  }
  pPage->isInit = 0;
}
................................................................................
** so that the cache is restored to its original state at the start of
** the transaction, for each page restored this routine is called.
**
** This routine needs to reset the extra data section at the end of the
** page to agree with the restored data.
*/
static void pageReinit(void *pData, int pageSize){
  MemPage *pPage = (MemPage*)&((char*)pData)[FORCE_ALIGNMENT(pageSize)];
  if( pPage->isInit ){
    pPage->isInit = 0;
    initPage(pPage, pPage->pParent);
  }
}

/*
................................................................................
    nReserve = zDbHeader[20];
    pBt->maxEmbedFrac = zDbHeader[21];
    pBt->minEmbedFrac = zDbHeader[22];
    pBt->minLeafFrac = zDbHeader[23];
    pBt->pageSizeFixed = 1;
  }
  pBt->usableSize = pBt->pageSize - nReserve;
  pBt->psAligned = FORCE_ALIGNMENT(pBt->pageSize);
  sqlite3pager_set_pagesize(pBt->pPager, pBt->pageSize);
  *ppBtree = pBt;
  return SQLITE_OK;
}

/*
** Close an open database and invalidate all cursors.
................................................................................
    return SQLITE_READONLY;
  }
  if( nReserve<0 ){
    nReserve = pBt->pageSize - pBt->usableSize;
  }
  if( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE ){
    pBt->pageSize = pageSize;
    pBt->psAligned = FORCE_ALIGNMENT(pageSize);
    sqlite3pager_set_pagesize(pBt->pPager, pageSize);
  }
  pBt->usableSize = pBt->pageSize - nReserve;
  return SQLITE_OK;
}

/*
................................................................................
      goto page1_init_failed;
    }
    pBt->pageSize = get2byte(&page1[16]);
    pBt->usableSize = pBt->pageSize - page1[20];
    if( pBt->usableSize<500 ){
      goto page1_init_failed;
    }
    pBt->psAligned = FORCE_ALIGNMENT(pBt->pageSize);
    pBt->maxEmbedFrac = page1[21];
    pBt->minEmbedFrac = page1[22];
    pBt->minLeafFrac = page1[23];
  }

  /* maxLocal is the maximum amount of payload to store locally for
  ** a cell.  Make sure it is small enough so that at least minFanout
................................................................................
**
** If there is a transaction in progress, this routine is a no-op.
*/
static void unlockBtreeIfUnused(Btree *pBt){
  if( pBt->inTrans==TRANS_NONE && pBt->pCursor==0 && pBt->pPage1!=0 ){
    if( pBt->pPage1->aData==0 ){
      MemPage *pPage = pBt->pPage1;
      pPage->aData = &((char*)pPage)[-pBt->psAligned];
      pPage->pBt = pBt;
      pPage->pgno = 1;
    }
    releasePage(pBt->pPage1);
    pBt->pPage1 = 0;
    pBt->inStmt = 0;
  }
................................................................................
  MemPage *pThis;
  unsigned char *aData;

  if( pgno==0 ) return;
  assert( pBt->pPager!=0 );
  aData = sqlite3pager_lookup(pBt->pPager, pgno);
  if( aData ){
    pThis = (MemPage*)&aData[pBt->psAligned];
    assert( pThis->aData==aData );
    if( pThis->isInit ){
      if( pThis->pParent!=pNewParent ){
        if( pThis->pParent ) sqlite3pager_unref(pThis->pParent->aData);
        pThis->pParent = pNewParent;
        if( pNewParent ) sqlite3pager_ref(pNewParent->aData);
      }
................................................................................
  /*
  ** Allocate space for memory structures
  */
  mxCellPerPage = MX_CELL(pBt);
  apCell = sqliteMallocRaw( 
       (mxCellPerPage+2)*NB*(sizeof(u8*)+sizeof(int))
     + sizeof(MemPage)*NB
     + pBt->psAligned*(5+NB)
  );
  if( apCell==0 ){
    return SQLITE_NOMEM;
  }
  szCell = (int*)&apCell[(mxCellPerPage+2)*NB];
  aCopy[0] = (u8*)&szCell[(mxCellPerPage+2)*NB];
  for(i=1; i<NB; i++){
    aCopy[i] = &aCopy[i-1][pBt->psAligned+sizeof(MemPage)];
  }
  aSpace = &aCopy[NB-1][pBt->psAligned+sizeof(MemPage)];
  
  /*
  ** Find the cell in the parent page whose left child points back
  ** to pPage.  The "idx" variable is the index of that cell.  If pPage
  ** is the rightmost child of pParent then set idx to pParent->nCell 
  */
  if( pParent->idxShift ){
................................................................................
  /*
  ** Make copies of the content of pPage and its siblings into aOld[].
  ** The rest of this function will use data from the copies rather
  ** that the original pages since the original pages will be in the
  ** process of being overwritten.
  */
  for(i=0; i<nOld; i++){
    MemPage *p = apCopy[i] = (MemPage*)&aCopy[i][pBt->psAligned];
    p->aData = &((u8*)p)[-pBt->psAligned];
    memcpy(p->aData, apOld[i]->aData, pBt->psAligned + sizeof(MemPage));
    p->aData = &((u8*)p)[-pBt->psAligned];
  }

  /*
  ** Load pointers to all cells on sibling pages and the divider cells
  ** into the local apCell[] array.  Make copies of the divider cells
  ** into space obtained form aSpace[] and remove the the divider Cells
  ** from pParent.
................................................................................
        */
        dropCell(pParent, nxDiv, sz);
      }else{
        u8 *pTemp;
        szCell[nCell] = sz;
        pTemp = &aSpace[iSpace];
        iSpace += sz;
        assert( iSpace<=pBt->psAligned*5 );
        memcpy(pTemp, apDiv[i], sz);
        apCell[nCell] = pTemp+leafCorrection;
        dropCell(pParent, nxDiv, sz);
        szCell[nCell] -= leafCorrection;
        assert( get4byte(pTemp)==pgnoOld[i] );
        if( !pOld->leaf ){
          assert( leafCorrection==0 );
................................................................................
      }else if( leafData ){
        CellInfo info;
        j--;
        parseCellPtr(pNew, apCell[j], &info);
        pCell = &aSpace[iSpace];
        fillInCell(pParent, pCell, 0, info.nKey, 0, 0, &sz);
        iSpace += sz;
        assert( iSpace<=pBt->psAligned*5 );
        pTemp = 0;
      }else{
        pCell -= 4;
        pTemp = &aSpace[iSpace];
        iSpace += sz;
        assert( iSpace<=pBt->psAligned*5 );
      }
      insertCell(pParent, nxDiv, pCell, sz, pTemp);
      put4byte(findOverflowCell(pParent,nxDiv), pNew->pgno);
      j++;
      nxDiv++;
    }
  }

Changes to src/pager.c.

14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
...
106
107
108
109
110
111
112






113
114
115
116
117
118
119
...
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
...
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
...
210
211
212
213
214
215
216

217
218
219
220
221
222
223
....
1499
1500
1501
1502
1503
1504
1505

1506
1507
1508
1509
1510
1511
1512
....
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
....
1560
1561
1562
1563
1564
1565
1566

1567
1568
1569
1570
1571
1572
1573
....
2129
2130
2131
2132
2133
2134
2135
2136
2137
2138
2139
2140
2141
2142
2143
** The pager is used to access a database disk file.  It implements
** atomic commit and rollback through the use of a journal file that
** is separate from the database file.  The pager also implements file
** locking to prevent two processes from writing the same database
** file simultaneously, or one process from reading the database while
** another is writing.
**
** @(#) $Id: pager.c,v 1.167 2004/10/05 02:41:43 drh Exp $
*/
#include "sqliteInt.h"
#include "os.h"
#include "pager.h"
#include <assert.h>
#include <string.h>

................................................................................
** (if it never invokes its busy callback) then the contention will be
** resolved quickly.
*/
#ifndef SQLITE_BUSY_RESERVED_LOCK
# define SQLITE_BUSY_RESERVED_LOCK 0
#endif







/*
** Each in-memory image of a page begins with the following header.
** This header is only visible to this pager module.  The client
** code that calls pager sees only the data that follows the header.
**
** Client code should call sqlite3pager_write() on a page prior to making
** any modifications to that page.  The first time sqlite3pager_write()
................................................................................
  u8 inJournal;                  /* TRUE if has been written to journal */
  u8 inStmt;                     /* TRUE if in the statement subjournal */
  u8 dirty;                      /* TRUE if we need to write back changes */
  u8 needSync;                   /* Sync journal before writing this page */
  u8 alwaysRollback;             /* Disable dont_rollback() for this page */
  short int nRef;                /* Number of users of this page */
  PgHdr *pDirty;                 /* Dirty pages sorted by PgHdr.pgno */
  /* pPager->pageSize bytes of page data follow this header */
  /* Pager.nExtra bytes of local data follow the page data */
};

/*
** For an in-memory only database, some extra information is recorded about
** each page so that changes can be rolled back.  (Journal files are not
** used for in-memory databases.)  The following information is added to
................................................................................

/*
** Convert a pointer to a PgHdr into a pointer to its data
** and back again.
*/
#define PGHDR_TO_DATA(P)  ((void*)(&(P)[1]))
#define DATA_TO_PGHDR(D)  (&((PgHdr*)(D))[-1])
#define PGHDR_TO_EXTRA(G,P) ((void*)&((char*)(&(G)[1]))[(P)->pageSize])
#define PGHDR_TO_HIST(P,PGR)  \
            ((PgHistory*)&((char*)(&(P)[1]))[(PGR)->pageSize+(PGR)->nExtra])

/*
** How big to make the hash table used for locating in-memory pages
** by page number.
*/
#define N_PG_HASH 2048

................................................................................
  int nRec;                   /* Number of pages written to the journal */
  u32 cksumInit;              /* Quasi-random value added to every checksum */
  int stmtNRec;               /* Number of records in stmt subjournal */
  int nExtra;                 /* Add this many bytes to each in-memory page */
  void (*xDestructor)(void*,int); /* Call this routine when freeing pages */
  void (*xReiniter)(void*,int);   /* Call this routine when reloading pages */
  int pageSize;               /* Number of bytes in a page */

  int nPage;                  /* Total number of in-memory pages */
  int nRef;                   /* Number of in-memory pages with PgHdr.nRef>0 */
  int mxPage;                 /* Maximum number of pages to hold in cache */
  int nHit, nMiss, nOvfl;     /* Cache hits, missing, and LRU overflows */
  void (*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
  void *pCodecArg;            /* First argument to xCodec() */
  u8 journalOpen;             /* True if journal file descriptors is valid */
................................................................................
  pPager->journalOpen = 0;
  pPager->useJournal = useJournal && !memDb;
  pPager->stmtOpen = 0;
  pPager->stmtInUse = 0;
  pPager->nRef = 0;
  pPager->dbSize = memDb-1;
  pPager->pageSize = SQLITE_DEFAULT_PAGE_SIZE;

  pPager->stmtSize = 0;
  pPager->stmtJSize = 0;
  pPager->nPage = 0;
  pPager->mxPage = 100;
  pPager->state = PAGER_UNLOCK;
  pPager->errMask = 0;
  pPager->tempFile = tempFile;
................................................................................
  pPager->readOnly = readOnly;
  pPager->needSync = 0;
  pPager->noSync = pPager->tempFile || !useJournal;
  pPager->fullSync = (pPager->noSync?0:1);
  pPager->pFirst = 0;
  pPager->pFirstSynced = 0;
  pPager->pLast = 0;
  pPager->nExtra = nExtra;
  pPager->sectorSize = PAGER_SECTOR_SIZE;
  pPager->pBusyHandler = 0;
  memset(pPager->aHash, 0, sizeof(pPager->aHash));
  *ppPager = pPager;
  return SQLITE_OK;
}

................................................................................
** Set the page size.
**
** The page size must only be changed when the cache is empty.
*/
void sqlite3pager_set_pagesize(Pager *pPager, int pageSize){
  assert( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE );
  pPager->pageSize = pageSize;

}

/*
** Read the first N bytes from the beginning of the file into memory
** that pDest points to.  No error checking is done.
*/
void sqlite3pager_read_fileheader(Pager *pPager, int N, unsigned char *pDest){
................................................................................
  }
  if( pPg==0 ){
    /* The requested page is not in the page cache. */
    int h;
    pPager->nMiss++;
    if( pPager->nPage<pPager->mxPage || pPager->pFirst==0 || pPager->memDb ){
      /* Create a new page */
      pPg = sqliteMallocRaw( sizeof(*pPg) + pPager->pageSize 
                              + sizeof(u32) + pPager->nExtra
                              + pPager->memDb*sizeof(PgHistory) );
      if( pPg==0 ){
        if( !pPager->memDb ){
          pager_unwritelock(pPager);
        }
        pPager->errMask |= PAGER_ERR_MEM;







|







 







>
>
>
>
>
>







 







|







 







|

|







 







>







 







>







 







|







 







>







 







|







14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
...
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
...
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
...
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
...
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
....
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
....
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
....
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
....
2138
2139
2140
2141
2142
2143
2144
2145
2146
2147
2148
2149
2150
2151
2152
** The pager is used to access a database disk file.  It implements
** atomic commit and rollback through the use of a journal file that
** is separate from the database file.  The pager also implements file
** locking to prevent two processes from writing the same database
** file simultaneously, or one process from reading the database while
** another is writing.
**
** @(#) $Id: pager.c,v 1.168 2004/10/22 16:22:59 drh Exp $
*/
#include "sqliteInt.h"
#include "os.h"
#include "pager.h"
#include <assert.h>
#include <string.h>

................................................................................
** (if it never invokes its busy callback) then the contention will be
** resolved quickly.
*/
#ifndef SQLITE_BUSY_RESERVED_LOCK
# define SQLITE_BUSY_RESERVED_LOCK 0
#endif

/*
** This macro rounds values up so that if the value is an address it
** is guaranteed to be an address that is aligned to an 8-byte boundary.
*/
#define FORCE_ALIGNMENT(X)   (((X)+7)&~7)

/*
** Each in-memory image of a page begins with the following header.
** This header is only visible to this pager module.  The client
** code that calls pager sees only the data that follows the header.
**
** Client code should call sqlite3pager_write() on a page prior to making
** any modifications to that page.  The first time sqlite3pager_write()
................................................................................
  u8 inJournal;                  /* TRUE if has been written to journal */
  u8 inStmt;                     /* TRUE if in the statement subjournal */
  u8 dirty;                      /* TRUE if we need to write back changes */
  u8 needSync;                   /* Sync journal before writing this page */
  u8 alwaysRollback;             /* Disable dont_rollback() for this page */
  short int nRef;                /* Number of users of this page */
  PgHdr *pDirty;                 /* Dirty pages sorted by PgHdr.pgno */
  /* pPager->psAligned bytes of page data follow this header */
  /* Pager.nExtra bytes of local data follow the page data */
};

/*
** For an in-memory only database, some extra information is recorded about
** each page so that changes can be rolled back.  (Journal files are not
** used for in-memory databases.)  The following information is added to
................................................................................

/*
** Convert a pointer to a PgHdr into a pointer to its data
** and back again.
*/
#define PGHDR_TO_DATA(P)  ((void*)(&(P)[1]))
#define DATA_TO_PGHDR(D)  (&((PgHdr*)(D))[-1])
#define PGHDR_TO_EXTRA(G,P) ((void*)&((char*)(&(G)[1]))[(P)->psAligned])
#define PGHDR_TO_HIST(P,PGR)  \
            ((PgHistory*)&((char*)(&(P)[1]))[(PGR)->psAligned+(PGR)->nExtra])

/*
** How big to make the hash table used for locating in-memory pages
** by page number.
*/
#define N_PG_HASH 2048

................................................................................
  int nRec;                   /* Number of pages written to the journal */
  u32 cksumInit;              /* Quasi-random value added to every checksum */
  int stmtNRec;               /* Number of records in stmt subjournal */
  int nExtra;                 /* Add this many bytes to each in-memory page */
  void (*xDestructor)(void*,int); /* Call this routine when freeing pages */
  void (*xReiniter)(void*,int);   /* Call this routine when reloading pages */
  int pageSize;               /* Number of bytes in a page */
  int psAligned;              /* pageSize rounded up to a multiple of 8 */
  int nPage;                  /* Total number of in-memory pages */
  int nRef;                   /* Number of in-memory pages with PgHdr.nRef>0 */
  int mxPage;                 /* Maximum number of pages to hold in cache */
  int nHit, nMiss, nOvfl;     /* Cache hits, missing, and LRU overflows */
  void (*xCodec)(void*,void*,Pgno,int); /* Routine for en/decoding data */
  void *pCodecArg;            /* First argument to xCodec() */
  u8 journalOpen;             /* True if journal file descriptors is valid */
................................................................................
  pPager->journalOpen = 0;
  pPager->useJournal = useJournal && !memDb;
  pPager->stmtOpen = 0;
  pPager->stmtInUse = 0;
  pPager->nRef = 0;
  pPager->dbSize = memDb-1;
  pPager->pageSize = SQLITE_DEFAULT_PAGE_SIZE;
  pPager->psAligned = FORCE_ALIGNMENT(pPager->pageSize);
  pPager->stmtSize = 0;
  pPager->stmtJSize = 0;
  pPager->nPage = 0;
  pPager->mxPage = 100;
  pPager->state = PAGER_UNLOCK;
  pPager->errMask = 0;
  pPager->tempFile = tempFile;
................................................................................
  pPager->readOnly = readOnly;
  pPager->needSync = 0;
  pPager->noSync = pPager->tempFile || !useJournal;
  pPager->fullSync = (pPager->noSync?0:1);
  pPager->pFirst = 0;
  pPager->pFirstSynced = 0;
  pPager->pLast = 0;
  pPager->nExtra = FORCE_ALIGNMENT(nExtra);
  pPager->sectorSize = PAGER_SECTOR_SIZE;
  pPager->pBusyHandler = 0;
  memset(pPager->aHash, 0, sizeof(pPager->aHash));
  *ppPager = pPager;
  return SQLITE_OK;
}

................................................................................
** Set the page size.
**
** The page size must only be changed when the cache is empty.
*/
void sqlite3pager_set_pagesize(Pager *pPager, int pageSize){
  assert( pageSize>=512 && pageSize<=SQLITE_MAX_PAGE_SIZE );
  pPager->pageSize = pageSize;
  pPager->psAligned = FORCE_ALIGNMENT(pageSize);
}

/*
** Read the first N bytes from the beginning of the file into memory
** that pDest points to.  No error checking is done.
*/
void sqlite3pager_read_fileheader(Pager *pPager, int N, unsigned char *pDest){
................................................................................
  }
  if( pPg==0 ){
    /* The requested page is not in the page cache. */
    int h;
    pPager->nMiss++;
    if( pPager->nPage<pPager->mxPage || pPager->pFirst==0 || pPager->memDb ){
      /* Create a new page */
      pPg = sqliteMallocRaw( sizeof(*pPg) + pPager->psAligned
                              + sizeof(u32) + pPager->nExtra
                              + pPager->memDb*sizeof(PgHistory) );
      if( pPg==0 ){
        if( !pPager->memDb ){
          pager_unwritelock(pPager);
        }
        pPager->errMask |= PAGER_ERR_MEM;

Changes to test/pagesize.test.

7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
..
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
# This file implements regression tests for SQLite library.
# This file implements tests for the page_size PRAGMA.
#
# $Id: pagesize.test,v 1.3 2004/09/05 00:33:44 drh Exp $


set testdir [file dirname $argv0]
source $testdir/tester.tcl

do_test pagesize-1.1 {
  execsql {PRAGMA page_size}
................................................................................
    PRAGMA page_size=65537;
    PRAGMA page_size;
  }
} 8192
  


foreach PGSZ {512 2000 2048 3000 4096} {
  do_test pagesize-2.$PGSZ.1 {
    db close
    file delete -force test.db
    sqlite3 db test.db
    execsql "PRAGMA page_size=$PGSZ"
    execsql {
      CREATE TABLE t1(x);







|







 







|







7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
..
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
# This file implements regression tests for SQLite library.
# This file implements tests for the page_size PRAGMA.
#
# $Id: pagesize.test,v 1.4 2004/10/22 16:22:59 drh Exp $


set testdir [file dirname $argv0]
source $testdir/tester.tcl

do_test pagesize-1.1 {
  execsql {PRAGMA page_size}
................................................................................
    PRAGMA page_size=65537;
    PRAGMA page_size;
  }
} 8192
  


foreach PGSZ {512 515 516 751 2000 2001 2002 2003 2004 2048 3000 4096} {
  do_test pagesize-2.$PGSZ.1 {
    db close
    file delete -force test.db
    sqlite3 db test.db
    execsql "PRAGMA page_size=$PGSZ"
    execsql {
      CREATE TABLE t1(x);