SQLite

Check-in [68e7a8c676]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Simplify the accessPayload() routine so that it always populates the overflow page cache. In the one case where populating the page cache can lead to problems, simply invalidate the cache as soon as accessPayload() returns. This simplification reduces code size and helps accessPayload() to run a little faster. This backs out the eOp==2 mode of accessPayload() added by check-in [da59198505].
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 68e7a8c6765649195ef1ad9407d87d44a307b462
User & Date: drh 2017-01-27 00:31:59.289
Context
2017-01-27
01:13
Performance optimization in accessPayload(). (check-in: ebb1fd98d4 user: drh tags: trunk)
00:31
Simplify the accessPayload() routine so that it always populates the overflow page cache. In the one case where populating the page cache can lead to problems, simply invalidate the cache as soon as accessPayload() returns. This simplification reduces code size and helps accessPayload() to run a little faster. This backs out the eOp==2 mode of accessPayload() added by check-in [da59198505]. (check-in: 68e7a8c676 user: drh tags: trunk)
2017-01-26
21:30
Remove an unreachable branch in the error handling logic for sqlite3BtreePayloadChecked(). (check-in: 293bf3ed7e user: drh tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/btree.c.
4420
4421
4422
4423
4424
4425
4426
4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
4448
/*
** This function is used to read or overwrite payload information
** for the entry that the pCur cursor is pointing to. The eOp
** argument is interpreted as follows:
**
**   0: The operation is a read. Populate the overflow cache.
**   1: The operation is a write. Populate the overflow cache.
**   2: The operation is a read. Do not populate the overflow cache.
**
** A total of "amt" bytes are read or written beginning at "offset".
** Data is read to or from the buffer pBuf.
**
** The content being read or written might appear on the main page
** or be scattered out on multiple overflow pages.
**
** If the current cursor entry uses one or more overflow pages and the
** eOp argument is not 2, this function may allocate space for and lazily 
** populates the overflow page-list cache array (BtCursor.aOverflow). 
** Subsequent calls use this cache to make seeking to the supplied offset 
** more efficient.
**
** Once an overflow page-list cache has been allocated, it may be
** invalidated if some other cursor writes to the same table, or if
** the cursor is moved to a different row. Additionally, in auto-vacuum
** mode, the following events may invalidate an overflow page-list cache.
**
**   * An incremental vacuum,
**   * A commit in auto_vacuum="full" mode,
**   * Creating a table (may require moving an overflow page).







<







|
|
|



|







4420
4421
4422
4423
4424
4425
4426

4427
4428
4429
4430
4431
4432
4433
4434
4435
4436
4437
4438
4439
4440
4441
4442
4443
4444
4445
4446
4447
/*
** This function is used to read or overwrite payload information
** for the entry that the pCur cursor is pointing to. The eOp
** argument is interpreted as follows:
**
**   0: The operation is a read. Populate the overflow cache.
**   1: The operation is a write. Populate the overflow cache.

**
** A total of "amt" bytes are read or written beginning at "offset".
** Data is read to or from the buffer pBuf.
**
** The content being read or written might appear on the main page
** or be scattered out on multiple overflow pages.
**
** If the current cursor entry uses one or more overflow pages
** this function may allocate space for and lazily populate
** the overflow page-list cache array (BtCursor.aOverflow). 
** Subsequent calls use this cache to make seeking to the supplied offset 
** more efficient.
**
** Once an overflow page-list cache has been allocated, it must be
** invalidated if some other cursor writes to the same table, or if
** the cursor is moved to a different row. Additionally, in auto-vacuum
** mode, the following events may invalidate an overflow page-list cache.
**
**   * An incremental vacuum,
**   * A commit in auto_vacuum="full" mode,
**   * Creating a table (may require moving an overflow page).
4460
4461
4462
4463
4464
4465
4466

4467
4468
4469
4470
4471
4472
4473
4474
4475
4476
4477
  MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
  BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
#ifdef SQLITE_DIRECT_OVERFLOW_READ
  unsigned char * const pBufStart = pBuf;     /* Start of original out buffer */
#endif

  assert( pPage );

  assert( pCur->eState==CURSOR_VALID );
  assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
  assert( cursorHoldsMutex(pCur) );
  assert( eOp!=2 || offset==0 );    /* Always start from beginning for eOp==2 */

  getCellInfo(pCur);
  aPayload = pCur->info.pPayload;
  assert( offset+amt <= pCur->info.nPayload );

  assert( aPayload > pPage->aData );
  if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){







>



<







4459
4460
4461
4462
4463
4464
4465
4466
4467
4468
4469

4470
4471
4472
4473
4474
4475
4476
  MemPage *pPage = pCur->apPage[pCur->iPage]; /* Btree page of current entry */
  BtShared *pBt = pCur->pBt;                  /* Btree this cursor belongs to */
#ifdef SQLITE_DIRECT_OVERFLOW_READ
  unsigned char * const pBufStart = pBuf;     /* Start of original out buffer */
#endif

  assert( pPage );
  assert( eOp==0 || eOp==1 );
  assert( pCur->eState==CURSOR_VALID );
  assert( pCur->aiIdx[pCur->iPage]<pPage->nCell );
  assert( cursorHoldsMutex(pCur) );


  getCellInfo(pCur);
  aPayload = pCur->info.pPayload;
  assert( offset+amt <= pCur->info.nPayload );

  assert( aPayload > pPage->aData );
  if( (uptr)(aPayload - pPage->aData) > (pBt->usableSize - pCur->info.nLocal) ){
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506
4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533
4534
4535
4536
4537
4538
4539
4540
4541
4542
4543
4544
4545
4546
4547
4548
4549
4550
4551
4552
4553
4554
4555
4556
4557
4558
4559
4560
4561
4562
4563
4564
4565
4566
4567
4568
4569
4570
4571

  /* Check if data must be read/written to/from the btree page itself. */
  if( offset<pCur->info.nLocal ){
    int a = amt;
    if( a+offset>pCur->info.nLocal ){
      a = pCur->info.nLocal - offset;
    }
    rc = copyPayload(&aPayload[offset], pBuf, a, (eOp & 0x01), pPage->pDbPage);
    offset = 0;
    pBuf += a;
    amt -= a;
  }else{
    offset -= pCur->info.nLocal;
  }


  if( rc==SQLITE_OK && amt>0 ){
    const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
    Pgno nextPage;

    nextPage = get4byte(&aPayload[pCur->info.nLocal]);

    /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.
    ** Except, do not allocate aOverflow[] for eOp==2.
    **
    ** The aOverflow[] array is sized at one entry for each overflow page
    ** in the overflow chain. The page number of the first overflow page is
    ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
    ** means "not yet known" (the cache is lazily populated).
    */
    if( eOp!=2 && (pCur->curFlags & BTCF_ValidOvfl)==0 ){
      int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
      if( nOvfl>pCur->nOvflAlloc ){
        Pgno *aNew = (Pgno*)sqlite3Realloc(
            pCur->aOverflow, nOvfl*2*sizeof(Pgno)
        );
        if( aNew==0 ){
          return SQLITE_NOMEM_BKPT;
        }else{
          pCur->nOvflAlloc = nOvfl*2;
          pCur->aOverflow = aNew;
        }
      }
      memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
      pCur->curFlags |= BTCF_ValidOvfl;
    }

    /* If the overflow page-list cache has been allocated and the
    ** entry for the first required overflow page is valid, skip
    ** directly to it.
    */
    if( (pCur->curFlags & BTCF_ValidOvfl)!=0
     && pCur->aOverflow[offset/ovflSize]
    ){
      iIdx = (offset/ovflSize);
      nextPage = pCur->aOverflow[iIdx];
      offset = (offset%ovflSize);
    }

    assert( rc==SQLITE_OK && amt>0 );
    while( nextPage ){
      /* If required, populate the overflow page-list cache. */
      if( (pCur->curFlags & BTCF_ValidOvfl)!=0 ){
        assert( pCur->aOverflow[iIdx]==0
                || pCur->aOverflow[iIdx]==nextPage
                || CORRUPT_DB );
        pCur->aOverflow[iIdx] = nextPage;
      }

      if( offset>=ovflSize ){
        /* The only reason to read this page is to obtain the page
        ** number for the next page in the overflow chain. The page
        ** data is not required. So first try to lookup the overflow
        ** page-list cache, if any, then fall back to the getOverflowPage()
        ** function.
        **
        ** Note that the aOverflow[] array must be allocated because eOp!=2
        ** here.  If eOp==2, then offset==0 and this branch is never taken.
        */
        assert( eOp!=2 );
        assert( pCur->curFlags & BTCF_ValidOvfl );
        assert( pCur->pBtree->db==pBt->db );
        if( pCur->aOverflow[iIdx+1] ){
          nextPage = pCur->aOverflow[iIdx+1];
        }else{
          rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
        }







|















<






|




















<
|
<








<
|
|
|
|
<







<
<
<

<







4484
4485
4486
4487
4488
4489
4490
4491
4492
4493
4494
4495
4496
4497
4498
4499
4500
4501
4502
4503
4504
4505
4506

4507
4508
4509
4510
4511
4512
4513
4514
4515
4516
4517
4518
4519
4520
4521
4522
4523
4524
4525
4526
4527
4528
4529
4530
4531
4532
4533

4534

4535
4536
4537
4538
4539
4540
4541
4542

4543
4544
4545
4546

4547
4548
4549
4550
4551
4552
4553



4554

4555
4556
4557
4558
4559
4560
4561

  /* Check if data must be read/written to/from the btree page itself. */
  if( offset<pCur->info.nLocal ){
    int a = amt;
    if( a+offset>pCur->info.nLocal ){
      a = pCur->info.nLocal - offset;
    }
    rc = copyPayload(&aPayload[offset], pBuf, a, eOp, pPage->pDbPage);
    offset = 0;
    pBuf += a;
    amt -= a;
  }else{
    offset -= pCur->info.nLocal;
  }


  if( rc==SQLITE_OK && amt>0 ){
    const u32 ovflSize = pBt->usableSize - 4;  /* Bytes content per ovfl page */
    Pgno nextPage;

    nextPage = get4byte(&aPayload[pCur->info.nLocal]);

    /* If the BtCursor.aOverflow[] has not been allocated, allocate it now.

    **
    ** The aOverflow[] array is sized at one entry for each overflow page
    ** in the overflow chain. The page number of the first overflow page is
    ** stored in aOverflow[0], etc. A value of 0 in the aOverflow[] array
    ** means "not yet known" (the cache is lazily populated).
    */
    if( (pCur->curFlags & BTCF_ValidOvfl)==0 ){
      int nOvfl = (pCur->info.nPayload-pCur->info.nLocal+ovflSize-1)/ovflSize;
      if( nOvfl>pCur->nOvflAlloc ){
        Pgno *aNew = (Pgno*)sqlite3Realloc(
            pCur->aOverflow, nOvfl*2*sizeof(Pgno)
        );
        if( aNew==0 ){
          return SQLITE_NOMEM_BKPT;
        }else{
          pCur->nOvflAlloc = nOvfl*2;
          pCur->aOverflow = aNew;
        }
      }
      memset(pCur->aOverflow, 0, nOvfl*sizeof(Pgno));
      pCur->curFlags |= BTCF_ValidOvfl;
    }

    /* If the overflow page-list cache has been allocated and the
    ** entry for the first required overflow page is valid, skip
    ** directly to it.
    */

    if( pCur->aOverflow[offset/ovflSize] ){

      iIdx = (offset/ovflSize);
      nextPage = pCur->aOverflow[iIdx];
      offset = (offset%ovflSize);
    }

    assert( rc==SQLITE_OK && amt>0 );
    while( nextPage ){
      /* If required, populate the overflow page-list cache. */

      assert( pCur->aOverflow[iIdx]==0
              || pCur->aOverflow[iIdx]==nextPage
              || CORRUPT_DB );
      pCur->aOverflow[iIdx] = nextPage;


      if( offset>=ovflSize ){
        /* The only reason to read this page is to obtain the page
        ** number for the next page in the overflow chain. The page
        ** data is not required. So first try to lookup the overflow
        ** page-list cache, if any, then fall back to the getOverflowPage()
        ** function.



        */

        assert( pCur->curFlags & BTCF_ValidOvfl );
        assert( pCur->pBtree->db==pBt->db );
        if( pCur->aOverflow[iIdx+1] ){
          nextPage = pCur->aOverflow[iIdx+1];
        }else{
          rc = getOverflowPage(pBt, nextPage, 0, &nextPage);
        }
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
4622
4623
4624
4625
4626
4627
4628
4629
4630
4631
        **   5) the page is not in the WAL file
        **   6) at least 4 bytes have already been read into the output buffer 
        **
        ** then data can be read directly from the database file into the
        ** output buffer, bypassing the page-cache altogether. This speeds
        ** up loading large records that span many overflow pages.
        */
        if( (eOp&0x01)==0                                      /* (1) */
         && offset==0                                          /* (2) */
         && pBt->inTransaction==TRANS_READ                     /* (3) */
         && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (4) */
         && 0==sqlite3PagerUseWal(pBt->pPager, nextPage)       /* (5) */
         && &pBuf[-4]>=pBufStart                               /* (6) */
        ){
          u8 aSave[4];
          u8 *aWrite = &pBuf[-4];
          assert( aWrite>=pBufStart );                         /* due to (6) */
          memcpy(aSave, aWrite, 4);
          rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
          nextPage = get4byte(aWrite);
          memcpy(aWrite, aSave, 4);
        }else
#endif

        {
          DbPage *pDbPage;
          rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
              ((eOp&0x01)==0 ? PAGER_GET_READONLY : 0)
          );
          if( rc==SQLITE_OK ){
            aPayload = sqlite3PagerGetData(pDbPage);
            nextPage = get4byte(aPayload);
            rc = copyPayload(&aPayload[offset+4], pBuf, a, (eOp&0x01), pDbPage);
            sqlite3PagerUnref(pDbPage);
            offset = 0;
          }
        }
        amt -= a;
        pBuf += a;
      }







|



















|




|







4582
4583
4584
4585
4586
4587
4588
4589
4590
4591
4592
4593
4594
4595
4596
4597
4598
4599
4600
4601
4602
4603
4604
4605
4606
4607
4608
4609
4610
4611
4612
4613
4614
4615
4616
4617
4618
4619
4620
4621
        **   5) the page is not in the WAL file
        **   6) at least 4 bytes have already been read into the output buffer 
        **
        ** then data can be read directly from the database file into the
        ** output buffer, bypassing the page-cache altogether. This speeds
        ** up loading large records that span many overflow pages.
        */
        if( eOp==0                                             /* (1) */
         && offset==0                                          /* (2) */
         && pBt->inTransaction==TRANS_READ                     /* (3) */
         && (fd = sqlite3PagerFile(pBt->pPager))->pMethods     /* (4) */
         && 0==sqlite3PagerUseWal(pBt->pPager, nextPage)       /* (5) */
         && &pBuf[-4]>=pBufStart                               /* (6) */
        ){
          u8 aSave[4];
          u8 *aWrite = &pBuf[-4];
          assert( aWrite>=pBufStart );                         /* due to (6) */
          memcpy(aSave, aWrite, 4);
          rc = sqlite3OsRead(fd, aWrite, a+4, (i64)pBt->pageSize*(nextPage-1));
          nextPage = get4byte(aWrite);
          memcpy(aWrite, aSave, 4);
        }else
#endif

        {
          DbPage *pDbPage;
          rc = sqlite3PagerGet(pBt->pPager, nextPage, &pDbPage,
              (eOp==0 ? PAGER_GET_READONLY : 0)
          );
          if( rc==SQLITE_OK ){
            aPayload = sqlite3PagerGetData(pDbPage);
            nextPage = get4byte(aPayload);
            rc = copyPayload(&aPayload[offset+4], pBuf, a, eOp, pDbPage);
            sqlite3PagerUnref(pDbPage);
            offset = 0;
          }
        }
        amt -= a;
        pBuf += a;
      }
5249
5250
5251
5252
5253
5254
5255
5256

5257
5258
5259
5260
5261
5262
5263
          }
          pCellKey = sqlite3Malloc( nCell+18 );
          if( pCellKey==0 ){
            rc = SQLITE_NOMEM_BKPT;
            goto moveto_finish;
          }
          pCur->aiIdx[pCur->iPage] = (u16)idx;
          rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 2);

          if( rc ){
            sqlite3_free(pCellKey);
            goto moveto_finish;
          }
          c = xRecordCompare(nCell, pCellKey, pIdxKey);
          sqlite3_free(pCellKey);
        }







|
>







5239
5240
5241
5242
5243
5244
5245
5246
5247
5248
5249
5250
5251
5252
5253
5254
          }
          pCellKey = sqlite3Malloc( nCell+18 );
          if( pCellKey==0 ){
            rc = SQLITE_NOMEM_BKPT;
            goto moveto_finish;
          }
          pCur->aiIdx[pCur->iPage] = (u16)idx;
          rc = accessPayload(pCur, 0, nCell, (unsigned char*)pCellKey, 0);
          pCur->curFlags &= ~BTCF_ValidOvfl;
          if( rc ){
            sqlite3_free(pCellKey);
            goto moveto_finish;
          }
          c = xRecordCompare(nCell, pCellKey, pIdxKey);
          sqlite3_free(pCellKey);
        }