SQLite

Check-in [67d2a89ec2]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Fix bug in log recovery (last frame in log was being ignored). Also remove an incorrect assert statement.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | wal
Files: files | file ages | folders
SHA1: 67d2a89ec2d593a077eb19a6ea2b06cb1c2e9ba8
User & Date: dan 2010-04-16 11:30:18.000
Context
2010-04-16
13:59
Change the log file format to include a small (12 byte) header at the start of the file. (check-in: 9865d14d60 user: dan tags: wal)
11:30
Fix bug in log recovery (last frame in log was being ignored). Also remove an incorrect assert statement. (check-in: 67d2a89ec2 user: dan tags: wal)
2010-04-15
16:45
Allow writers to write dirty pages to the log mid-transaction in order to free memory. (check-in: ecd828f969 user: dan tags: wal)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/log.c.
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>

typedef struct LogSummaryHdr LogSummaryHdr;
typedef struct LogSummary LogSummary;
typedef struct LogCheckpoint LogCheckpoint;
typedef struct LogLock LogLock;


/*
** The following structure may be used to store the same data that
** is stored in the log-summary header.
**







|







8
9
10
11
12
13
14
15
16
17
18
19
20
21
22

#include <unistd.h>
#include <fcntl.h>
#include <sys/mman.h>

typedef struct LogSummaryHdr LogSummaryHdr;
typedef struct LogSummary LogSummary;
typedef struct LogIterator LogIterator;
typedef struct LogLock LogLock;


/*
** The following structure may be used to store the same data that
** is stored in the log-summary header.
**
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
** This structure is used to implement an iterator that iterates through
** all frames in the log in database page order. Where two or more frames
** correspond to the same database page, the iterator visits only the 
** frame most recently written to the log.
**
** The internals of this structure are only accessed by:
**
**   logCheckpointInit() - Create a new iterator,
**   logCheckpointNext() - Step an iterator,
**   logCheckpointFree() - Free an iterator.
**
** This functionality is used by the checkpoint code (see logCheckpoint()).
*/
struct LogCheckpoint {
  int nSegment;                   /* Size of LogCheckpoint.aSummary[] array */
  int nFinal;                     /* Elements in segment nSegment-1 */
  struct LogSegment {
    int iNext;                    /* Next aIndex index */
    u8 *aIndex;                   /* Pointer to index array */
    u32 *aDbPage;                 /* Pointer to db page array */
  } aSegment[1];
};







|
|
|



|
|







105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
** This structure is used to implement an iterator that iterates through
** all frames in the log in database page order. Where two or more frames
** correspond to the same database page, the iterator visits only the 
** frame most recently written to the log.
**
** The internals of this structure are only accessed by:
**
**   logIteratorInit() - Create a new iterator,
**   logIteratorNext() - Step an iterator,
**   logIteratorFree() - Free an iterator.
**
** This functionality is used by the checkpoint code (see logCheckpoint()).
*/
struct LogIterator {
  int nSegment;                   /* Size of LogIterator.aSegment[] array */
  int nFinal;                     /* Elements in segment nSegment-1 */
  struct LogSegment {
    int iNext;                    /* Next aIndex index */
    u8 *aIndex;                   /* Pointer to index array */
    u32 *aDbPage;                 /* Pointer to db page array */
  } aSegment[1];
};
342
343
344
345
346
347
348


349
350
351
352
353
354
355
  u32 *aCksum,                    /* IN/OUT: Checksum values */
  u32 *piPage,                    /* OUT: Database page number for frame */
  u32 *pnTruncate,                /* OUT: New db size (or 0 if not commit) */
  int nData,                      /* Database page size (size of aData[]) */
  u8 *aData,                      /* Pointer to page data (for checksum) */
  u8 *aFrame                      /* Frame data */
){


  logChecksumBytes(aFrame, 12, aCksum);
  logChecksumBytes(aData, nData, aCksum);

  if( aCksum[0]!=sqlite3Get4byte(&aFrame[12]) 
   || aCksum[1]!=sqlite3Get4byte(&aFrame[16]) 
  ){
    /* Checksum failed. */







>
>







342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
  u32 *aCksum,                    /* IN/OUT: Checksum values */
  u32 *piPage,                    /* OUT: Database page number for frame */
  u32 *pnTruncate,                /* OUT: New db size (or 0 if not commit) */
  int nData,                      /* Database page size (size of aData[]) */
  u8 *aData,                      /* Pointer to page data (for checksum) */
  u8 *aFrame                      /* Frame data */
){
  assert( LOG_FRAME_HDRSIZE==20 );

  logChecksumBytes(aFrame, 12, aCksum);
  logChecksumBytes(aData, nData, aCksum);

  if( aCksum[0]!=sqlite3Get4byte(&aFrame[12]) 
   || aCksum[1]!=sqlite3Get4byte(&aFrame[16]) 
  ){
    /* Checksum failed. */
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
      return SQLITE_NOMEM;
    }
    aData = &aFrame[LOG_FRAME_HDRSIZE];

    /* Read all frames from the log file. */
    iFrame = 0;
    iOffset = 0;
    for(iOffset=0; (iOffset+nFrame)<nSize; iOffset+=nFrame){
      u32 pgno;                   /* Database page number for frame */
      u32 nTruncate;              /* dbsize field from frame header */
      int isValid;                /* True if this frame is valid */

      /* Read and decode the next log frame. */
      rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset);
      if( rc!=SQLITE_OK ) break;







|







511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
      return SQLITE_NOMEM;
    }
    aData = &aFrame[LOG_FRAME_HDRSIZE];

    /* Read all frames from the log file. */
    iFrame = 0;
    iOffset = 0;
    for(iOffset=0; (iOffset+nFrame)<=nSize; iOffset+=nFrame){
      u32 pgno;                   /* Database page number for frame */
      u32 nTruncate;              /* dbsize field from frame header */
      int isValid;                /* True if this frame is valid */

      /* Read and decode the next log frame. */
      rc = sqlite3OsRead(pFd, aFrame, nFrame, iOffset);
      if( rc!=SQLITE_OK ) break;
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
    assert( !pSummary || pSummary->nRef==0 );
    sqlite3_free(pSummary);
  }
  *ppLog = pRet;
  return rc;
}

static int logCheckpointNext(
  LogCheckpoint *p,               /* Iterator */
  u32 *piPage,                    /* OUT: Next db page to write */
  u32 *piFrame                    /* OUT: Log frame to read from */
){
  u32 iMin = *piPage;
  u32 iRet = 0xFFFFFFFF;
  int i;
  int nBlock = p->nFinal;







|
|







711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
    assert( !pSummary || pSummary->nRef==0 );
    sqlite3_free(pSummary);
  }
  *ppLog = pRet;
  return rc;
}

static int logIteratorNext(
  LogIterator *p,               /* Iterator */
  u32 *piPage,                    /* OUT: Next db page to write */
  u32 *piFrame                    /* OUT: Log frame to read from */
){
  u32 iMin = *piPage;
  u32 iRet = 0xFFFFFFFF;
  int i;
  int nBlock = p->nFinal;
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
    nBlock = 256;
  }

  *piPage = iRet;
  return (iRet==0xFFFFFFFF);
}

static LogCheckpoint *logCheckpointInit(Log *pLog){
  u32 *aData = pLog->pSummary->aData;
  LogCheckpoint *p;               /* Return value */
  int nSegment;                   /* Number of segments to merge */
  u32 iLast;                      /* Last frame in log */
  int nByte;                      /* Number of bytes to allocate */
  int i;                          /* Iterator variable */
  int nFinal;                     /* Number of unindexed entries */
  struct LogSegment *pFinal;      /* Final (unindexed) segment */
  u8 *aTmp;                       /* Temp space used by merge-sort */

  iLast = pLog->hdr.iLastPg;
  nSegment = (iLast >> 8) + 1;
  nFinal = (iLast & 0x000000FF);

  nByte = sizeof(LogCheckpoint) + (nSegment-1)*sizeof(struct LogSegment) + 512;
  p = (LogCheckpoint *)sqlite3_malloc(nByte);
  if( p ){
    memset(p, 0, nByte);
    p->nSegment = nSegment;
    p->nFinal = nFinal;
  }

  for(i=0; i<nSegment-1; i++){







|

|












|
|







742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
    nBlock = 256;
  }

  *piPage = iRet;
  return (iRet==0xFFFFFFFF);
}

static LogIterator *logIteratorInit(Log *pLog){
  u32 *aData = pLog->pSummary->aData;
  LogIterator *p;                 /* Return value */
  int nSegment;                   /* Number of segments to merge */
  u32 iLast;                      /* Last frame in log */
  int nByte;                      /* Number of bytes to allocate */
  int i;                          /* Iterator variable */
  int nFinal;                     /* Number of unindexed entries */
  struct LogSegment *pFinal;      /* Final (unindexed) segment */
  u8 *aTmp;                       /* Temp space used by merge-sort */

  iLast = pLog->hdr.iLastPg;
  nSegment = (iLast >> 8) + 1;
  nFinal = (iLast & 0x000000FF);

  nByte = sizeof(LogIterator) + (nSegment-1)*sizeof(struct LogSegment) + 512;
  p = (LogIterator *)sqlite3_malloc(nByte);
  if( p ){
    memset(p, 0, nByte);
    p->nSegment = nSegment;
    p->nFinal = nFinal;
  }

  for(i=0; i<nSegment-1; i++){
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
  logMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
  p->nFinal = nFinal;

  return p;
}

/* 
** Free a log iterator allocated by logCheckpointInit().
*/
static void logCheckpointFree(LogCheckpoint *p){
  sqlite3_free(p);
}

/*
** Checkpoint the contents of the log file.
*/
static int logCheckpoint(
  Log *pLog,                      /* Log connection */
  sqlite3_file *pFd,              /* File descriptor open on db file */
  u8 *zBuf                        /* Temporary buffer to use */
){
  int rc;                         /* Return code */
  int pgsz = pLog->hdr.pgsz;      /* Database page-size */
  LogCheckpoint *pIter = 0;       /* Log iterator context */
  u32 iDbpage = 0;                /* Next database page to write */
  u32 iFrame = 0;                 /* Log frame containing data for iDbpage */

  if( pLog->hdr.iLastPg==0 ){
    return SQLITE_OK;
  }

  /* Allocate the iterator */
  pIter = logCheckpointInit(pLog);
  if( !pIter ) return SQLITE_NOMEM;

  /* Sync the log file to disk */
  rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
  if( rc!=SQLITE_OK ) goto out;

  /* Iterate through the contents of the log, copying data to the db file. */
  while( 0==logCheckpointNext(pIter, &iDbpage, &iFrame) ){
    rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz, 
        (iFrame-1) * (pgsz+LOG_FRAME_HDRSIZE) + LOG_FRAME_HDRSIZE
    );
    if( rc!=SQLITE_OK ) goto out;
    rc = sqlite3OsWrite(pFd, zBuf, pgsz, (iDbpage-1)*pgsz);
    if( rc!=SQLITE_OK ) goto out;
  }







|

|













|








|







|







784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
  logMergesort8(pFinal->aDbPage, aTmp, pFinal->aIndex, &nFinal);
  p->nFinal = nFinal;

  return p;
}

/* 
** Free a log iterator allocated by logIteratorInit().
*/
static void logIteratorFree(LogIterator *p){
  sqlite3_free(p);
}

/*
** Checkpoint the contents of the log file.
*/
static int logCheckpoint(
  Log *pLog,                      /* Log connection */
  sqlite3_file *pFd,              /* File descriptor open on db file */
  u8 *zBuf                        /* Temporary buffer to use */
){
  int rc;                         /* Return code */
  int pgsz = pLog->hdr.pgsz;      /* Database page-size */
  LogIterator *pIter = 0;         /* Log iterator context */
  u32 iDbpage = 0;                /* Next database page to write */
  u32 iFrame = 0;                 /* Log frame containing data for iDbpage */

  if( pLog->hdr.iLastPg==0 ){
    return SQLITE_OK;
  }

  /* Allocate the iterator */
  pIter = logIteratorInit(pLog);
  if( !pIter ) return SQLITE_NOMEM;

  /* Sync the log file to disk */
  rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
  if( rc!=SQLITE_OK ) goto out;

  /* Iterate through the contents of the log, copying data to the db file. */
  while( 0==logIteratorNext(pIter, &iDbpage, &iFrame) ){
    rc = sqlite3OsRead(pLog->pFd, zBuf, pgsz, 
        (iFrame-1) * (pgsz+LOG_FRAME_HDRSIZE) + LOG_FRAME_HDRSIZE
    );
    if( rc!=SQLITE_OK ) goto out;
    rc = sqlite3OsWrite(pFd, zBuf, pgsz, (iDbpage-1)*pgsz);
    if( rc!=SQLITE_OK ) goto out;
  }
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
  memset(zBuf, 0, LOG_FRAME_HDRSIZE);
  rc = sqlite3OsWrite(pLog->pFd, zBuf, LOG_FRAME_HDRSIZE, 0);
  if( rc!=SQLITE_OK ) goto out;
  rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
#endif

 out:
  logCheckpointFree(pIter);
  return rc;
}

/*
** Close a connection to a log file.
*/
int sqlite3LogClose(







|







859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
  memset(zBuf, 0, LOG_FRAME_HDRSIZE);
  rc = sqlite3OsWrite(pLog->pFd, zBuf, LOG_FRAME_HDRSIZE, 0);
  if( rc!=SQLITE_OK ) goto out;
  rc = sqlite3OsSync(pLog->pFd, pLog->sync_flags);
#endif

 out:
  logIteratorFree(pIter);
  return rc;
}

/*
** Close a connection to a log file.
*/
int sqlite3LogClose(
Changes to src/pager.c.
3249
3250
3251
3252
3253
3254
3255

3256
3257
3258
3259
3260
3261
3262
3263
3264
3265
static int pagerStress(void *p, PgHdr *pPg){
  Pager *pPager = (Pager *)p;
  int rc = SQLITE_OK;

  assert( pPg->pPager==pPager );
  assert( pPg->flags&PGHDR_DIRTY );


  if( pagerUseLog(pPager) ){
    /* Write a single frame for this page to the log. */
    assert( pPg->pDirty==0 );
    rc = sqlite3LogFrames(pPager->pLog, pPager->pageSize, pPg, 0, 0, 0);
  }else{
    /* The doNotSync flag is set by the sqlite3PagerWrite() function while it
    ** is journalling a set of two or more database pages that are stored
    ** on the same disk sector. Syncing the journal is not allowed while
    ** this is happening as it is important that all members of such a
    ** set of pages are synced to disk together. So, if the page this function







>


<







3249
3250
3251
3252
3253
3254
3255
3256
3257
3258

3259
3260
3261
3262
3263
3264
3265
static int pagerStress(void *p, PgHdr *pPg){
  Pager *pPager = (Pager *)p;
  int rc = SQLITE_OK;

  assert( pPg->pPager==pPager );
  assert( pPg->flags&PGHDR_DIRTY );

  pPg->pDirty = 0;
  if( pagerUseLog(pPager) ){
    /* Write a single frame for this page to the log. */

    rc = sqlite3LogFrames(pPager->pLog, pPager->pageSize, pPg, 0, 0, 0);
  }else{
    /* The doNotSync flag is set by the sqlite3PagerWrite() function while it
    ** is journalling a set of two or more database pages that are stored
    ** on the same disk sector. Syncing the journal is not allowed while
    ** this is happening as it is important that all members of such a
    ** set of pages are synced to disk together. So, if the page this function
3320
3321
3322
3323
3324
3325
3326
3327
3328
3329
3330
3331
3332
3333
3334
        rc==SQLITE_OK && pPg->pgno>pPager->dbSize && subjRequiresPage(pPg)
    ) ){
      rc = subjournalPage(pPg);
    }
  
    /* Write the contents of the page out to the database file. */
    if( rc==SQLITE_OK ){
      pPg->pDirty = 0;
      rc = pager_write_pagelist(pPg);
    }
  }

  /* Mark the page as clean. */
  if( rc==SQLITE_OK ){
    PAGERTRACE(("STRESS %d page %d\n", PAGERID(pPager), pPg->pgno));







<







3320
3321
3322
3323
3324
3325
3326

3327
3328
3329
3330
3331
3332
3333
        rc==SQLITE_OK && pPg->pgno>pPager->dbSize && subjRequiresPage(pPg)
    ) ){
      rc = subjournalPage(pPg);
    }
  
    /* Write the contents of the page out to the database file. */
    if( rc==SQLITE_OK ){

      rc = pager_write_pagelist(pPg);
    }
  }

  /* Mark the page as clean. */
  if( rc==SQLITE_OK ){
    PAGERTRACE(("STRESS %d page %d\n", PAGERID(pPager), pPg->pgno));
Changes to test/wal.test.
616
617
618
619
620
621
622


























































623
624
625
    PRAGMA integrity_check;
  }
} {17 ok}
do_test wal-11.14 {
  list [expr [file size test.db]/1024] [expr [file size test.db-wal]/1044]
} {37 38}




























































finish_test








>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>



616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
    PRAGMA integrity_check;
  }
} {17 ok}
do_test wal-11.14 {
  list [expr [file size test.db]/1024] [expr [file size test.db-wal]/1044]
} {37 38}


#-------------------------------------------------------------------------
# This block of tests, wal-12.*, tests a problem...
#
reopen_db
do_test wal-12.1 {
  execsql {
    PRAGMA page_size = 1024;
    CREATE TABLE t1(x, y);
    CREATE TABLE t2(x, y);
    INSERT INTO t1 VALUES('A', 1);
  }
  list [expr [file size test.db]/1024] [expr [file size test.db-wal]/1044]
} {0 5}
do_test wal-12.2 {
  db close
  sqlite3_wal db test.db
  execsql {
    UPDATE t1 SET y = 0 WHERE x = 'A';
  }
  list [expr [file size test.db]/1024] [expr [file size test.db-wal]/1044]
} {3 1}
do_test wal-12.3 {
  execsql { INSERT INTO t2 VALUES('B', 1) }
  list [expr [file size test.db]/1024] [expr [file size test.db-wal]/1044]
} {3 2}

do_test wal-12.4 {
  file copy -force test.db test2.db
  file copy -force test.db-wal test2.db-wal
  sqlite3_wal db2 test2.db
breakpoint
  execsql { SELECT * FROM t2 } db2
} {B 1}
db2 close

file copy -force test.db-wal A
do_test wal-12.5 {
  execsql {
    PRAGMA checkpoint;
    UPDATE t2 SET y = 2 WHERE x = 'B'; 
    PRAGMA checkpoint;
    UPDATE t1 SET y = 1 WHERE x = 'A';
    PRAGMA checkpoint;
    UPDATE t1 SET y = 0 WHERE x = 'A';
    SELECT * FROM t2;
  }
} {B 2}
file copy -force test.db-wal B

do_test wal-12.4 {
  file copy -force test.db test2.db
  file copy -force test.db-wal test2.db-wal
  sqlite3_wal db2 test2.db
  execsql { SELECT * FROM t2 } db2
} {B 2}
db2 close


finish_test