SQLite

Check-in [ada9a8c7b6]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Update the wal-index hash format so that hash-table space is reused following a rollback, thus preventing hash table overflows. Add assert()s to verify that hash tables do not overfill. Further refactoring of the wal-index code.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: ada9a8c7b69c5dd2d66bbf62b61181651e6d2142
User & Date: drh 2010-05-18 23:29:53.000
Context
2010-05-19
01:53
Add a large comment to wal.c describing the WAL and wal-index file formats. (check-in: a71a22b52f user: drh tags: trunk)
2010-05-18
23:29
Update the wal-index hash format so that hash-table space is reused following a rollback, thus preventing hash table overflows. Add assert()s to verify that hash tables do not overfill. Further refactoring of the wal-index code. (check-in: ada9a8c7b6 user: drh tags: trunk)
18:01
Refactoring of the WalIterator implementation. (check-in: b5b60fdcc5 user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/wal.c.
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24















25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44





45
46





47
48
49

50
51
52


53
54




55
56
57




58
59
60
61
62



63
64
65

66

67




68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
**
** This file contains the implementation of a write-ahead log file used in 
** "journal_mode=wal" mode.
*/
#ifndef SQLITE_OMIT_WAL

#include "wal.h"


/*
** WRITE-AHEAD LOG (WAL) FILE FORMAT
**
** A wal file consists of a header followed by zero or more "frames".















** The file header is 12 bytes in size and consists of the following three
** big-endian 32-bit unsigned integer values:
**
**     0: Database page size,
**     4: Randomly selected salt value 1,
**     8: Randomly selected salt value 2.
**
** Immediately following the header are zero or more frames. Each
** frame itself consists of a 16-byte header followed by a <page-size> bytes
** of page data. The header is broken into 4 big-endian 32-bit unsigned 
** integer values, as follows:
**
**     0: Page number.
**     4: For commit records, the size of the database image in pages 
**        after the commit. For all other records, zero.
**     8: Checksum value 1.
**    12: Checksum value 2.
*/

/* 





** WAL-INDEX FILE FORMAT
**





** The wal-index consists of a header region, followed by an 
** 8-byte region that contains no useful data (used to apply byte-range locks
** in some implementations), followed by the data region. 

**
** The contents of both the header and data region are specified in terms
** of 1, 2 and 4 byte unsigned integers. All integers are stored in 


** machine-endian order.  The wal-index is not a persistent file and
** so it does not need to be portable across archtectures.




**
** A wal-index file is essentially a shadow-pager map. It contains a
** mapping from database page number to the set of locations in the wal




** file that contain versions of the database page. When a database 
** client needs to read a page of data, it first queries the wal-index
** to determine if the required version of the page is stored in
** the wal. If so, the page is read from the wal. If not, the page is
** read from the database file.



**
** Whenever a transaction is appended to the wal or a checkpoint transfers
** data from the wal into the database file, the wal-index is 

** updated accordingly.

*/





/* Object declarations */
typedef struct WalIndexHdr WalIndexHdr;
typedef struct WalIterator WalIterator;


/*
** The following object stores a copy of the wal-index header.
**
** Member variables iCheck1 and iCheck2 contain the checksum for the
** last frame written to the wal, or 2 and 3 respectively if the log 
** is currently empty.
*/
struct WalIndexHdr {
  u32 iChange;                    /* Counter incremented each transaction */
  u32 pgsz;                       /* Database page size in bytes */
  u32 iLastPg;                    /* Address of last valid frame in log */
  u32 nPage;                      /* Size of database in pages */
  u32 iCheck1;                    /* Checkpoint value 1 */
  u32 iCheck2;                    /* Checkpoint value 2 */
};

/* Size of serialized WalIndexHdr object. */
#define WALINDEX_HDR_NFIELD (sizeof(WalIndexHdr) / sizeof(u32))







|
<
<
<
<
<
<



>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
|
















|
|
|
>
>
>
>
>
|

>
>
>
>
>
|
<
<
>
|
<
<
>
>
|
<
>
>
>
>

|
<
>
>
>
>
|
|
|
<
|
>
>
>

<
|
>
|
>

>
>
>
>
















|







8
9
10
11
12
13
14
15






16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66


67
68


69
70
71

72
73
74
75
76
77

78
79
80
81
82
83
84

85
86
87
88
89

90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
**    May you find forgiveness for yourself and forgive others.
**    May you share freely, never taking more than you give.
**
*************************************************************************
**
** This file contains the implementation of a write-ahead log file used in 
** "journal_mode=wal" mode.
**






** WRITE-AHEAD LOG (WAL) FILE FORMAT
**
** A wal file consists of a header followed by zero or more "frames".
** Each frame records the revised content of a single page within the
** database file.  All changes to the database are recorded by writing
** frames into the WAL.  Transactions commit when a frame is written that
** contains a commit marker.  A single WAL can and usually does record 
** multiple transactions.  Periodically, the content of the WAL is
** transferred back into the database file in an operation called a
** "checkpoint".
**
** A single WAL file can be used multiple times.  In other words, the
** WAL can fill up with frames and then be checkpointed.  Then new
** frames can overwrite the old ones.  A WAL always grows from beginning
** toward the end.  Checksums and counters attached to each frame are
** used to determine which frames within the WAL are valid and which
** are leftovers from prior checkpoints.
**
** The WAL header is 12 bytes in size and consists of the following three
** big-endian 32-bit unsigned integer values:
**
**     0: Database page size,
**     4: Randomly selected salt value 1,
**     8: Randomly selected salt value 2.
**
** Immediately following the header are zero or more frames. Each
** frame itself consists of a 16-byte header followed by a <page-size> bytes
** of page data. The header is broken into 4 big-endian 32-bit unsigned 
** integer values, as follows:
**
**     0: Page number.
**     4: For commit records, the size of the database image in pages 
**        after the commit. For all other records, zero.
**     8: Checksum value 1.
**    12: Checksum value 2.
**
** READER ALGORITHM
**
** To read a page from the database (call it page number P), a reader
** first checks the WAL to see if it contains page P.  If so, then the
** last valid instance of page P that is or is followed by a commit frame
** become the value read.  If the WAL contains no copies of page P that
** are valid and which are or are followed by a commit frame, then page
** P is read from the database file.
**
** The reader algorithm in the previous paragraph works correctly, but 
** because frames for page P can appear anywhere within the WAL, the
** reader has to scan the either WAL looking for page P frames.  If the
** WAL is large (multiple megabytes is typical) that scan can be slow,
** and read performanc suffers.  To overcome this problem, a separate
** datastructure called the wal-index is maintained to expedite the


** search for frames of a particular page.
** 


** WAL-INDEX FORMAT
**
** Conceptually, the wal-index is shared memory, though VFS implementations

** might choose to implement the wal-index using a mmapped file.  Because
** the wal-index is shared memory, SQLite does not support journal_mode=WAL 
** on a network filesystem.  All users of the database must be able to
** share memory.
**
** The wal-index is transient.  After a crash, the wal-index can (and should

** be) reconstructed from the original WAL file.  In fact, the VFS is required
** to either truncate or zero the header of the wal-index when the last
** connection to it closes.  Because the wal-index is transient, it can
** use an architecture-specific format; it does not have to be cross-platform.
** Hence, unlike the database and WAL file formats which store all values
** as big endian, the wal-index can store multi-byte values in the native
** byte order of the host computer.

**
** The purpose of the wal-index is to answer this question quickly:  Given
** a page number P, return the index of the last frame for page P in the WAL,
** or return NULL if there are no frames for page P in the WAL.
**

** The wal-index consists of a header region, followed by an one or
** more index blocks.  
**
** To be completed....
*/
#ifndef SQLITE_OMIT_WAL

#include "wal.h"


/* Object declarations */
typedef struct WalIndexHdr WalIndexHdr;
typedef struct WalIterator WalIterator;


/*
** The following object stores a copy of the wal-index header.
**
** Member variables iCheck1 and iCheck2 contain the checksum for the
** last frame written to the wal, or 2 and 3 respectively if the log 
** is currently empty.
*/
struct WalIndexHdr {
  u32 iChange;                    /* Counter incremented each transaction */
  u32 pgsz;                       /* Database page size in bytes */
  u32 iLastPg;                    /* Index of last valid frame in the WAL */
  u32 nPage;                      /* Size of database in pages */
  u32 iCheck1;                    /* Checkpoint value 1 */
  u32 iCheck2;                    /* Checkpoint value 2 */
};

/* Size of serialized WalIndexHdr object. */
#define WALINDEX_HDR_NFIELD (sizeof(WalIndexHdr) / sizeof(u32))
301
302
303
304
305
306
307
308
309
310



311
312
313
314
315
316
317
318
319
320
321
322
323

  *piPage = sqlite3Get4byte(&aFrame[0]);
  *pnTruncate = sqlite3Get4byte(&aFrame[4]);
  return 1;
}

/*
** Define the size of the hash tables in the wal-index file. There
** is a hash-table following every HASHTABLE_NPAGE page numbers in the
** wal-index.



*/
#define HASHTABLE_NPAGE      4096
#define HASHTABLE_DATATYPE   u16

#define HASHTABLE_NSLOT     (HASHTABLE_NPAGE*2)
#define HASHTABLE_NBYTE     (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT)

/*
** Return the index in the Wal.pWiData array that corresponds to 
** frame iFrame.
**
** Wal.pWiData is an array of u32 elements that is the wal-index.
** The array begins with a header and is then followed by alternating







|


>
>
>

|

|
|
|







332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357

  *piPage = sqlite3Get4byte(&aFrame[0]);
  *pnTruncate = sqlite3Get4byte(&aFrame[4]);
  return 1;
}

/*
** Define the parameters of the hash tables in the wal-index file. There
** is a hash-table following every HASHTABLE_NPAGE page numbers in the
** wal-index.
**
** Changing any of these constants will alter the wal-index format and
** create incompatibilities.
*/
#define HASHTABLE_NPAGE      4096  /* Must be power of 2 and multiple of 256 */
#define HASHTABLE_DATATYPE   u16
#define HASHTABLE_HASH_1     383                  /* Should be prime */
#define HASHTABLE_NSLOT      (HASHTABLE_NPAGE*2)  /* Must be a power of 2 */
#define HASHTABLE_NBYTE      (sizeof(HASHTABLE_DATATYPE)*HASHTABLE_NSLOT)

/*
** Return the index in the Wal.pWiData array that corresponds to 
** frame iFrame.
**
** Wal.pWiData is an array of u32 elements that is the wal-index.
** The array begins with a header and is then followed by alternating
406
407
408
409
410
411
412





413





414
415
416
417
418
419
420
421
}

/*
** Increment by which to increase the wal-index file size.
*/
#define WALINDEX_MMAP_INCREMENT (64*1024)






static int walHashKey(u32 iPage){





  return (iPage*2) % (HASHTABLE_NSLOT-1);
}


/* 
** Find the hash table and (section of the) page number array used to
** store data for WAL frame iFrame.
**







>
>
>
>
>
|
>
>
>
>
>
|







440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
}

/*
** Increment by which to increase the wal-index file size.
*/
#define WALINDEX_MMAP_INCREMENT (64*1024)


/*
** Compute a hash on a page number.  The resulting hash value must land
** between 0 and (HASHTABLE_NSLOT-1).
*/
static int walHash(u32 iPage){
  assert( iPage>0 );
  assert( (HASHTABLE_NSLOT & (HASHTABLE_NSLOT-1))==0 );
  return (iPage*HASHTABLE_HASH_1) & (HASHTABLE_NSLOT-1);
}
static int walNextHash(int iPriorHash){
  return (iPriorHash+1)&(HASHTABLE_NSLOT-1);
}


/* 
** Find the hash table and (section of the) page number array used to
** store data for WAL frame iFrame.
**
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
  *paHash = aHash;
  *paPgno = aPgno;
  *piZero = iZero;
}


/*
** Set an entry in the wal-index map to map log frame iFrame to db 
** page iPage. Values are always appended to the wal-index (i.e. the
** value of iFrame is always exactly one more than the value passed to
** the previous call), but that restriction is not enforced or asserted
** here.
*/
static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
  int rc;                         /* Return code */
  int nMapping;                   /* Required mapping size in bytes */
  
  /* Make sure the wal-index is mapped. Enlarge the mapping if required. */
  nMapping = walMappingSize(iFrame);







|
<
<
<
|







501
502
503
504
505
506
507
508



509
510
511
512
513
514
515
516
  *paHash = aHash;
  *paPgno = aPgno;
  *piZero = iZero;
}


/*
** Set an entry in the wal-index that will map database page number



** pPage into WAL frame iFrame.
*/
static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
  int rc;                         /* Return code */
  int nMapping;                   /* Required mapping size in bytes */
  
  /* Make sure the wal-index is mapped. Enlarge the mapping if required. */
  nMapping = walMappingSize(iFrame);
486
487
488
489
490
491
492

493
494
495
496

497
498


499
500
501
502
503
504
505
  */
  if( rc==SQLITE_OK ){
    int iKey;                     /* Hash table key */
    u32 iZero;                    /* One less than frame number of aPgno[1] */
    volatile u32 *aPgno;                 /* Page number array */
    volatile HASHTABLE_DATATYPE *aHash;  /* Hash table */
    int idx;                             /* Value to write to hash-table slot */


    walHashFind(pWal, iFrame, &aHash, &aPgno, &iZero);
    idx = iFrame - iZero;
    if( idx==1 ) memset((void*)aHash, 0, HASHTABLE_NBYTE);

    aPgno[iFrame] = iPage;
    for(iKey=walHashKey(iPage); aHash[iKey]; iKey=(iKey+1)%HASHTABLE_NSLOT);


    aHash[iKey] = idx;
  }

  return rc;
}









>



|
>

|
>
>







527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
  */
  if( rc==SQLITE_OK ){
    int iKey;                     /* Hash table key */
    u32 iZero;                    /* One less than frame number of aPgno[1] */
    volatile u32 *aPgno;                 /* Page number array */
    volatile HASHTABLE_DATATYPE *aHash;  /* Hash table */
    int idx;                             /* Value to write to hash-table slot */
    TESTONLY( int nCollide = 0;          /* Number of hash collisions */ )

    walHashFind(pWal, iFrame, &aHash, &aPgno, &iZero);
    idx = iFrame - iZero;
    if( idx==1 ) memset((void*)aHash, 0xff, HASHTABLE_NBYTE);
    assert( idx <= HASHTABLE_NSLOT/2 + 1 );
    aPgno[iFrame] = iPage;
    for(iKey=walHash(iPage); aHash[iKey]<idx; iKey=walNextHash(iKey)){
      assert( nCollide++ < idx );
    }
    aHash[iKey] = idx;
  }

  return rc;
}


1229
1230
1231
1232
1233
1234
1235

1236
1237


1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
  ** can occur. But if it does, it should not cause any problems.
  */
  for(iHash=iLast; iHash>0 && iRead==0; iHash-=HASHTABLE_NPAGE){
    volatile HASHTABLE_DATATYPE *aHash;  /* Pointer to hash table */
    volatile u32 *aPgno;                 /* Pointer to array of page numbers */
    u32 iZero;                    /* Frame number corresponding to aPgno[0] */
    int iKey;                     /* Hash slot index */


    walHashFind(pWal, iHash, &aHash, &aPgno, &iZero);


    for(iKey=walHashKey(pgno); aHash[iKey]; iKey=(iKey+1)%HASHTABLE_NSLOT){
      u32 iFrame = aHash[iKey] + iZero;
      if( iFrame<=iLast && aPgno[iFrame]==pgno && iFrame>iRead ){
        iRead = iFrame;
      }
    }
  }
  assert( iRead==0 || pWal->pWiData[walIndexEntry(iRead)]==pgno );

#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT







>


>
>
|

|







1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
  ** can occur. But if it does, it should not cause any problems.
  */
  for(iHash=iLast; iHash>0 && iRead==0; iHash-=HASHTABLE_NPAGE){
    volatile HASHTABLE_DATATYPE *aHash;  /* Pointer to hash table */
    volatile u32 *aPgno;                 /* Pointer to array of page numbers */
    u32 iZero;                    /* Frame number corresponding to aPgno[0] */
    int iKey;                     /* Hash slot index */
    int mxHash;                   /* upper bound on aHash[] values */

    walHashFind(pWal, iHash, &aHash, &aPgno, &iZero);
    mxHash = iLast - iZero;
    if( mxHash > HASHTABLE_NPAGE )  mxHash = HASHTABLE_NPAGE;
    for(iKey=walHash(pgno); aHash[iKey]<=mxHash; iKey=walNextHash(iKey)){
      u32 iFrame = aHash[iKey] + iZero;
      if( ALWAYS(iFrame<=iLast) && aPgno[iFrame]==pgno && iFrame>iRead ){
        iRead = iFrame;
      }
    }
  }
  assert( iRead==0 || pWal->pWiData[walIndexEntry(iRead)]==pgno );

#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT