/ Check-in [ed77556a]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:If a writer exits unexpectedly in the middle of a transaction, have the following writer remove any wal-index hash-table entries left by the interrupted transaction.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: ed77556adcdf7011b95b9969b360269fb2ebe4e5
User & Date: dan 2010-05-25 10:50:57
Context
2010-05-25
13:40
Update header comments in wal.c to correctly describe the WAL file format. Update the locking region offsets in os_unix.c and os_win.c and add assert() statement to verify that the locking region offsets are correct. check-in: 40030c07 user: drh tags: trunk
10:50
If a writer exits unexpectedly in the middle of a transaction, have the following writer remove any wal-index hash-table entries left by the interrupted transaction. check-in: ed77556a user: dan tags: trunk
02:24
Remove unreachable code associated with WAL from the pager. check-in: 54c1718e user: drh tags: trunk
Changes
Hide Diffs Unified Diffs Ignore Whitespace Patch

Changes to src/wal.c.

656
657
658
659
660
661
662






















































663
664
665
666
667
668
669
...
688
689
690
691
692
693
694


695

696











697
698
699
700
701
702
703
....
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
....
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627

1628
1629
1630
1631
1632
1633
1634
  assert( walIndexEntry(iZero+1)==(&aPgno[iZero+1] - pWal->pWiData) );

  *paHash = aHash;
  *paPgno = aPgno;
  *piZero = iZero;
}
























































/*
** Set an entry in the wal-index that will map database page number
** pPage into WAL frame iFrame.
*/
static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
  int rc;                         /* Return code */
................................................................................
    volatile u32 *aPgno;                 /* Page number array */
    volatile HASHTABLE_DATATYPE *aHash;  /* Hash table */
    int idx;                             /* Value to write to hash-table slot */
    TESTONLY( int nCollide = 0;          /* Number of hash collisions */ )

    walHashFind(pWal, iFrame, &aHash, &aPgno, &iZero);
    idx = iFrame - iZero;


    if( idx==1 ) memset((void*)aHash, 0, HASHTABLE_NBYTE);

    assert( idx <= HASHTABLE_NSLOT/2 + 1 );











    aPgno[iFrame] = iPage;
    for(iKey=walHash(iPage); aHash[iKey]; iKey=walNextHash(iKey)){
      assert( nCollide++ < idx );
    }
    aHash[iKey] = idx;

#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
................................................................................
    }
  }else if( pWal->lockState==SQLITE_SHM_WRITE ){
    rc = walSetLock(pWal, SQLITE_SHM_READ);
  }
  return rc;
}

/*
** Remove entries from the hash table that point to WAL slots greater
** than pWal->hdr.mxFrame.
**
** This function is called whenever pWal->hdr.mxFrame is decreased due
** to a rollback or savepoint.
**
** At most only the very last hash table needs to be updated.  Any
** later hash tables will be automatically cleared when pWal->hdr.mxFrame
** advances to the point where those hash tables are actually needed.
*/
static void walCleanupHash(Wal *pWal){
  volatile HASHTABLE_DATATYPE *aHash;  /* Pointer to hash table to clear */
  volatile u32 *aPgno;                 /* Unused return from walHashFind() */
  u32 iZero;                           /* frame == (aHash[x]+iZero) */
  int iLimit;                          /* Zero values greater than this */

  assert( pWal->lockState==SQLITE_SHM_WRITE );
  walHashFind(pWal, pWal->hdr.mxFrame+1, &aHash, &aPgno, &iZero);
  iLimit = pWal->hdr.mxFrame - iZero;
  if( iLimit>0 ){
    int i;                      /* Used to iterate through aHash[] */
    for(i=0; i<HASHTABLE_NSLOT; i++){
      if( aHash[i]>iLimit ){
        aHash[i] = 0;
      }
    }
  }

#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
  /* Verify that the every entry in the mapping region is still reachable
  ** via the hash table even after the cleanup.
  */
  {
    int i;           /* Loop counter */
    int iKey;        /* Hash key */
    for(i=1; i<=iLimit; i++){
      for(iKey=walHash(aPgno[i+iZero]); aHash[iKey]; iKey=walNextHash(iKey)){
        if( aHash[iKey]==i ) break;
      }
      assert( aHash[iKey]==i );
    }
  }
#endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */
}

/*
** If any data has been written (but not committed) to the log file, this
** function moves the write-pointer back to the start of the transaction.
**
** Additionally, the callback function is invoked for each frame written
** to the log since the start of the transaction. If the callback returns
** other than SQLITE_OK, it is not invoked again and the error code is
................................................................................
    int unused;
    Pgno iMax = pWal->hdr.mxFrame;
    Pgno iFrame;
  
    assert( pWal->pWiData==0 );
    rc = walIndexReadHdr(pWal, &unused);
    if( rc==SQLITE_OK ){
      walCleanupHash(pWal);
      for(iFrame=pWal->hdr.mxFrame+1; rc==SQLITE_OK && iFrame<=iMax; iFrame++){
        assert( pWal->lockState==SQLITE_SHM_WRITE );
        rc = xUndo(pUndoCtx, pWal->pWiData[walIndexEntry(iFrame)]);
      }

    }
    walIndexUnmap(pWal);
  }
  return rc;
}

/* 







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







 







>
>
|
>

>
>
>
>
>
>
>
>
>
>
>







 







<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<







 







<




>







656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
...
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
....
1616
1617
1618
1619
1620
1621
1622














































1623
1624
1625
1626
1627
1628
1629
....
1638
1639
1640
1641
1642
1643
1644

1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
  assert( walIndexEntry(iZero+1)==(&aPgno[iZero+1] - pWal->pWiData) );

  *paHash = aHash;
  *paPgno = aPgno;
  *piZero = iZero;
}

/*
** Remove entries from the hash table that point to WAL slots greater
** than pWal->hdr.mxFrame.
**
** This function is called whenever pWal->hdr.mxFrame is decreased due
** to a rollback or savepoint.
**
** At most only the very last hash table needs to be updated.  Any
** later hash tables will be automatically cleared when pWal->hdr.mxFrame
** advances to the point where those hash tables are actually needed.
*/
static void walCleanupHash(Wal *pWal){
  volatile HASHTABLE_DATATYPE *aHash;  /* Pointer to hash table to clear */
  volatile u32 *aPgno;                 /* Unused return from walHashFind() */
  u32 iZero;                           /* frame == (aHash[x]+iZero) */
  int iLimit;                          /* Zero values greater than this */

  assert( pWal->lockState==SQLITE_SHM_WRITE );
  walHashFind(pWal, pWal->hdr.mxFrame+1, &aHash, &aPgno, &iZero);
  iLimit = pWal->hdr.mxFrame - iZero;
  if( iLimit>0 ){
    int nByte;                    /* Number of bytes to zero in aPgno[] */
    int i;                        /* Used to iterate through aHash[] */
    for(i=0; i<HASHTABLE_NSLOT; i++){
      if( aHash[i]>iLimit ){
        aHash[i] = 0;
      }
    }

    /* Zero the entries in the aPgno array that correspond to frames with
    ** frame numbers greater than pWal->hdr.mxFrame. 
    */
    nByte = sizeof(u32) * (HASHTABLE_NPAGE-iLimit);
    memset((void *)&aPgno[iZero+iLimit+1], 0, nByte);
    assert( &((u8 *)&aPgno[iZero+iLimit+1])[nByte]==(u8 *)aHash );
  }

#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
  /* Verify that the every entry in the mapping region is still reachable
  ** via the hash table even after the cleanup.
  */
  {
    int i;           /* Loop counter */
    int iKey;        /* Hash key */
    for(i=1; i<=iLimit; i++){
      for(iKey=walHash(aPgno[i+iZero]); aHash[iKey]; iKey=walNextHash(iKey)){
        if( aHash[iKey]==i ) break;
      }
      assert( aHash[iKey]==i );
    }
  }
#endif /* SQLITE_ENABLE_EXPENSIVE_ASSERT */
}


/*
** Set an entry in the wal-index that will map database page number
** pPage into WAL frame iFrame.
*/
static int walIndexAppend(Wal *pWal, u32 iFrame, u32 iPage){
  int rc;                         /* Return code */
................................................................................
    volatile u32 *aPgno;                 /* Page number array */
    volatile HASHTABLE_DATATYPE *aHash;  /* Hash table */
    int idx;                             /* Value to write to hash-table slot */
    TESTONLY( int nCollide = 0;          /* Number of hash collisions */ )

    walHashFind(pWal, iFrame, &aHash, &aPgno, &iZero);
    idx = iFrame - iZero;
    if( idx==1 ){
      memset((void*)&aPgno[iZero+1], 0, HASHTABLE_NPAGE*sizeof(u32));
      memset((void*)aHash, 0, HASHTABLE_NBYTE);
    }
    assert( idx <= HASHTABLE_NSLOT/2 + 1 );

    if( aPgno[iFrame] ){
      /* If the entry in aPgno[] is already set, then the previous writer
      ** must have exited unexpectedly in the middle of a transaction (after
      ** writing one or more dirty pages to the WAL to free up memory). 
      ** Remove the remnants of that writers uncommitted transaction from 
      ** the hash-table before writing any new entries.
      */
      walCleanupHash(pWal);
      assert( !aPgno[iFrame] );
    }
    aPgno[iFrame] = iPage;
    for(iKey=walHash(iPage); aHash[iKey]; iKey=walNextHash(iKey)){
      assert( nCollide++ < idx );
    }
    aHash[iKey] = idx;

#ifdef SQLITE_ENABLE_EXPENSIVE_ASSERT
................................................................................
    }
  }else if( pWal->lockState==SQLITE_SHM_WRITE ){
    rc = walSetLock(pWal, SQLITE_SHM_READ);
  }
  return rc;
}















































/*
** If any data has been written (but not committed) to the log file, this
** function moves the write-pointer back to the start of the transaction.
**
** Additionally, the callback function is invoked for each frame written
** to the log since the start of the transaction. If the callback returns
** other than SQLITE_OK, it is not invoked again and the error code is
................................................................................
    int unused;
    Pgno iMax = pWal->hdr.mxFrame;
    Pgno iFrame;
  
    assert( pWal->pWiData==0 );
    rc = walIndexReadHdr(pWal, &unused);
    if( rc==SQLITE_OK ){

      for(iFrame=pWal->hdr.mxFrame+1; rc==SQLITE_OK && iFrame<=iMax; iFrame++){
        assert( pWal->lockState==SQLITE_SHM_WRITE );
        rc = xUndo(pUndoCtx, pWal->pWiData[walIndexEntry(iFrame)]);
      }
      walCleanupHash(pWal);
    }
    walIndexUnmap(pWal);
  }
  return rc;
}

/* 

Added test/walcrash2.test.









































































































































































































>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
# 2010 May 25
#
# The author disclaims copyright to this source code.  In place of
# a legal notice, here is a blessing:
#
#    May you do good and not evil.
#    May you find forgiveness for yourself and forgive others.
#    May you share freely, never taking more than you give.
#
#***********************************************************************
#


set testdir [file dirname $argv0]
source $testdir/tester.tcl
source $testdir/lock_common.tcl
ifcapable !wal {finish_test ; return }

proc wal_file_size {nFrame pgsz} { 
  expr {24 + ($pgsz+24)*$nFrame} 
}

#-------------------------------------------------------------------------
# This test case demonstrates a flaw in the wal-index manipulation that
# existed at one point: If a process crashes mid-transaction, it may have
# already added some entries to one of the hash-tables in the wal-index.
# If the transaction were to be explicitly rolled back at this point, the
# hash-table entries would be removed as part of the rollback. However,
# if the process crashes, the transaction is implicitly rolled back and
# the rogue entries remain in the hash table.
#
# Normally, this causes no problem - readers can tell the difference 
# between committed and uncommitted entries in the hash table. However,
# if it happens often enough that all slots in the hash-table become 
# non-zero, the next process that attempts to read or write the hash
# table falls into an infinite loop.
#
# Even if run with an SQLite version affected by the bug, this test case
# only goes into an infinite loop if SQLite is compiled without SQLITE_DEBUG
# defined. If SQLITE_DEBUG is defined, the program is halted by a failing
# assert() before entering the infinite loop.
#
# walcrash2-1.1: Create a database. Commit a transaction that adds 8 frames
#                to the WAL (and 8 entry to the first hash-table in the 
#                wal-index).
#
# walcrash2-1.2: Have an external process open a transaction, add 8 entries
#                to the wal-index hash-table, then crash. Repeat this 1023
#                times (so that the wal-index contains 8192 entries - all
#                slots are non-zero).
#
# walcrash2-1.3: Using a new database connection, attempt to query the 
#                database. This should cause the process to go into the
#                infinite loop.
#
do_test walcrash2-1.1 {
  execsql {
    PRAGMA page_size = 1024;
    PRAGMA journal_mode = WAL;
    PRAGMA synchronous = NORMAL;
    BEGIN;
      CREATE TABLE t1(x);
      CREATE TABLE t2(x);
      CREATE TABLE t3(x);
      CREATE TABLE t4(x);
      CREATE TABLE t5(x);
      CREATE TABLE t6(x);
      CREATE TABLE t7(x);
    COMMIT;
  }
  file size test.db-wal
} [wal_file_size 8 1024] 
for {set nEntry 8} {$nEntry < 8192} {incr nEntry 8} {
  do_test walcrash2-1.2.[expr $nEntry/8] {
    set C [launch_testfixture]
    testfixture $C {
      sqlite3 db test.db
      db eval {
        PRAGMA cache_size = 15;
        BEGIN;
          INSERT INTO t1 VALUES(randomblob(900));         --  1 row,  1  page
          INSERT INTO t1 SELECT * FROM t1;                --  2 rows, 3  pages
          INSERT INTO t1 SELECT * FROM t1;                --  4 rows, 5  pages
          INSERT INTO t1 SELECT * FROM t1;                --  8 rows, 9  pages
          INSERT INTO t1 SELECT * FROM t1;                -- 16 rows, 17 pages
          INSERT INTO t1 SELECT * FROM t1 LIMIT 3;        -- 20 rows, 20 pages
      }
    } 
    close $C
    file size test.db-wal
  } [wal_file_size 16 1024]
}
do_test walcrash2-1.3 {
  sqlite3 db2 test.db
  execsql { SELECT count(*) FROM t1 } db2
} {0}
catch { db2 close }

finish_test