/ Check-in [778e8a10]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Use a different free-list format for server-mode databases in order to reduce contention.
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | server-edition
Files: files | file ages | folders
SHA3-256: 778e8a102d8dc7b0fa006c2d90b0a8cd36ebc1e16bd17477f2d536fc0cef4bf3
User & Date: dan 2017-04-28 10:20:03
Context
2017-04-28
14:09
Add a missing releasePage() call to the server-mode free-list management code. check-in: a5a08548 user: dan tags: server-edition
10:20
Use a different free-list format for server-mode databases in order to reduce contention. check-in: 778e8a10 user: dan tags: server-edition
2017-04-27
14:12
Do not write master journal filenames into server-mode journal files. Use SQLITE_MUTEX_STATIC_APP1 to protect critical sections in server.c. check-in: 3144ae40 user: dan tags: server-edition
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to src/btree.c.

  5612   5612     ){
  5613   5613       return btreePrevious(pCur, pRes);
  5614   5614     }
  5615   5615     pCur->ix--;
  5616   5616     return SQLITE_OK;
  5617   5617   }
  5618   5618   
         5619  +#ifdef SQLITE_SERVER_EDITION
         5620  +
         5621  +#define SERVER_DEFAULT_FREELISTS      16
         5622  +#define SERVER_DEFAULT_FREELIST_SIZE 128
         5623  +
         5624  +/*
         5625  +** Allocate the free-node and the first SERVER_DEFAULT_FREELISTS 
         5626  +** trunk pages.
         5627  +*/
         5628  +static int allocateServerFreenode(BtShared *pBt){
         5629  +  int rc;
         5630  +  MemPage *pPage1 = pBt->pPage1;
         5631  +
         5632  +  rc = sqlite3PagerWrite(pPage1->pDbPage);
         5633  +  if( rc==SQLITE_OK ){
         5634  +    Pgno pgnoNode = (++pBt->nPage);
         5635  +    MemPage *pNode = 0;
         5636  +    int i;
         5637  +
         5638  +    put4byte(&pPage1->aData[32], pgnoNode);
         5639  +    rc = btreeGetUnusedPage(pBt, pgnoNode, &pNode, PAGER_GET_NOCONTENT);
         5640  +    if( rc==SQLITE_OK ){
         5641  +      rc = sqlite3PagerWrite(pNode->pDbPage);
         5642  +    }
         5643  +    if( rc==SQLITE_OK ){
         5644  +      put4byte(&pNode->aData[0], 0);
         5645  +      put4byte(&pNode->aData[4], SERVER_DEFAULT_FREELISTS);
         5646  +    }
         5647  +    for(i=0; rc==SQLITE_OK && i<SERVER_DEFAULT_FREELISTS; i++){
         5648  +      MemPage *pTrunk = 0;
         5649  +      Pgno pgnoTrunk;
         5650  +      if( ++pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
         5651  +      pgnoTrunk = pBt->nPage;
         5652  +
         5653  +      rc = btreeGetUnusedPage(pBt, pgnoTrunk, &pTrunk, PAGER_GET_NOCONTENT);
         5654  +      if( rc==SQLITE_OK ){
         5655  +        rc = sqlite3PagerWrite(pTrunk->pDbPage);
         5656  +      }
         5657  +      if( rc==SQLITE_OK ){
         5658  +        memset(pTrunk->aData, 0, 8);
         5659  +        put4byte(&pNode->aData[8+i*4], pgnoTrunk);
         5660  +      }
         5661  +      releasePage(pTrunk);
         5662  +    }
         5663  +    releasePage(pNode);
         5664  +  }
         5665  +
         5666  +  return rc;
         5667  +}
         5668  +
         5669  +/*
         5670  +** Return a reference to the first trunk page in one of the database free-lists.
         5671  +** Allocate the database free-lists if required.
         5672  +*/
         5673  +static int findServerTrunk(BtShared *pBt, int bAlloc, MemPage **ppTrunk){
         5674  +  MemPage *pPage1 = pBt->pPage1;
         5675  +  MemPage *pNode = 0;             /* The node page */
         5676  +  MemPage *pTrunk = 0;            /* The returned page */
         5677  +  Pgno iNode;                     /* Page number of node page */
         5678  +  int rc = SQLITE_OK;
         5679  +
         5680  +  /* If the node page and free-list trunks have not yet been allocated, allocate
         5681  +  ** them now.  */
         5682  +  pPage1 = pBt->pPage1;
         5683  +  iNode = get4byte(&pPage1->aData[32]);
         5684  +  if( iNode==0 ){
         5685  +    rc = allocateServerFreenode(pBt);
         5686  +    iNode = get4byte(&pPage1->aData[32]);
         5687  +  }
         5688  +
         5689  +  /* Grab the node page */
         5690  +  if( rc==SQLITE_OK ){
         5691  +    rc = btreeGetUnusedPage(pBt, iNode, &pNode, 0);
         5692  +  }
         5693  +  if( rc==SQLITE_OK ){
         5694  +    int nList;                    /* Number of free-lists in this db */
         5695  +    int i;
         5696  +
         5697  +    /* Try to lock a free-list trunk. If bAlloc is true, it has to be a
         5698  +    ** free-list trunk with at least one entry in the free-list. */
         5699  +    nList = (int)get4byte(&pNode->aData[4]);
         5700  +    for(i=0; i<nList; i++){
         5701  +      Pgno iTrunk = get4byte(&pNode->aData[8+i*4]);
         5702  +      if( SQLITE_OK==sqlite3PagerWritelock(pBt->pPager, iTrunk) ){
         5703  +        rc = btreeGetUnusedPage(pBt, iTrunk, &pTrunk, 0);
         5704  +        if( rc==SQLITE_OK && bAlloc ){
         5705  +          if( !get4byte(&pTrunk->aData[0]) && !get4byte(&pTrunk->aData[4]) ){
         5706  +            releasePage(pTrunk);
         5707  +            pTrunk = 0;
         5708  +          }
         5709  +        }
         5710  +        if( rc!=SQLITE_OK || pTrunk ) break;
         5711  +      }
         5712  +    }
         5713  +
         5714  +    /* No free pages in any free-list. Or perhaps we were locked out. In 
         5715  +    ** either case, try to allocate more from the end of the file now.  */
         5716  +    if( i==nList ){
         5717  +      assert( rc==SQLITE_OK && pTrunk==0 );
         5718  +      rc = sqlite3PagerWrite(pPage1->pDbPage);
         5719  +      for(i=0; rc==SQLITE_OK && i<nList; i++){
         5720  +        /* Add some free pages to each free-list. No server-locks are required
         5721  +        ** to do this as we have a write-lock on page 1 - guaranteeing
         5722  +        ** exclusive access to the db file.  */
         5723  +        MemPage *pT = 0;
         5724  +        Pgno iTrunk = get4byte(&pNode->aData[8+i*4]);
         5725  +        rc = btreeGetUnusedPage(pBt, iTrunk, &pT, 0);
         5726  +        if( rc==SQLITE_OK ){
         5727  +          rc = sqlite3PagerWrite(pT->pDbPage);
         5728  +        }
         5729  +        if( rc==SQLITE_OK ){
         5730  +          int iPg = get4byte(&pT->aData[4]);
         5731  +          for(/*no-op*/; iPg<SERVER_DEFAULT_FREELIST_SIZE; iPg++){
         5732  +            if( ++pBt->nPage==PENDING_BYTE_PAGE(pBt) ) pBt->nPage++;
         5733  +            put4byte(&pT->aData[8+iPg*4], pBt->nPage);
         5734  +          }
         5735  +          put4byte(&pT->aData[4], iPg);
         5736  +          if( pTrunk==0 ){
         5737  +            pTrunk = pT;
         5738  +            pT = 0;
         5739  +          }
         5740  +        }
         5741  +        releasePage(pT);
         5742  +      }
         5743  +      if( rc==SQLITE_OK ){
         5744  +        MemPage *pLast = 0;
         5745  +        rc = btreeGetUnusedPage(pBt, pBt->nPage, &pLast, 0);
         5746  +        if( rc==SQLITE_OK ){
         5747  +          rc = sqlite3PagerWrite(pLast->pDbPage);
         5748  +          releasePage(pLast);
         5749  +          put4byte(28 + (u8*)pPage1->aData, pBt->nPage);
         5750  +        }
         5751  +      }
         5752  +    }
         5753  +  }
         5754  +
         5755  +  releasePage(pNode);
         5756  +  if( rc==SQLITE_OK ){
         5757  +    assert( pTrunk );
         5758  +    rc = sqlite3PagerWrite(pTrunk->pDbPage);
         5759  +  }
         5760  +  if( rc!=SQLITE_OK ){
         5761  +    releasePage(pTrunk);
         5762  +    pTrunk = 0;
         5763  +  }
         5764  +  *ppTrunk = pTrunk;
         5765  +  return rc;
         5766  +}
         5767  +
         5768  +static int allocateServerPage(
         5769  +  BtShared *pBt,         /* The btree */
         5770  +  MemPage **ppPage,      /* Store pointer to the allocated page here */
         5771  +  Pgno *pPgno,           /* Store the page number here */
         5772  +  Pgno nearby,           /* Search for a page near this one */
         5773  +  u8 eMode               /* BTALLOC_EXACT, BTALLOC_LT, or BTALLOC_ANY */
         5774  +){
         5775  +  int rc;                         /* Return code */
         5776  +  MemPage *pTrunk = 0;            /* The node page */
         5777  +  Pgno pgnoNew = 0;
         5778  +
         5779  +  assert( eMode==BTALLOC_ANY );
         5780  +  assert( sqlite3_mutex_held(pBt->mutex) );
         5781  +
         5782  +  rc = findServerTrunk(pBt, 1, &pTrunk);
         5783  +  if( rc==SQLITE_OK ){
         5784  +    int nFree;              /* Number of free pages on this trunk page */
         5785  +    nFree = (int)get4byte(&pTrunk->aData[4]);
         5786  +    if( nFree==0 ){
         5787  +      pgnoNew = get4byte(&pTrunk->aData[0]);
         5788  +      assert( pgnoNew );
         5789  +    }else{
         5790  +      nFree--;
         5791  +      pgnoNew = get4byte(&pTrunk->aData[8+4*nFree]);
         5792  +      put4byte(&pTrunk->aData[4], (u32)nFree);
         5793  +      releasePage(pTrunk);
         5794  +      pTrunk = 0;
         5795  +    }
         5796  +  }
         5797  +
         5798  +  if( rc==SQLITE_OK ){
         5799  +    MemPage *pNew = 0;
         5800  +    rc = btreeGetUnusedPage(pBt, pgnoNew, &pNew, pTrunk?0:PAGER_GET_NOCONTENT);
         5801  +    if( rc==SQLITE_OK ){
         5802  +      rc = sqlite3PagerWrite(pNew->pDbPage);
         5803  +    }
         5804  +    if( rc==SQLITE_OK && pTrunk ){
         5805  +      memcpy(pTrunk->aData, pNew->aData, pBt->usableSize);
         5806  +    }
         5807  +    *ppPage = pNew;
         5808  +    *pPgno = pgnoNew;
         5809  +  }
         5810  +
         5811  +  releasePage(pTrunk);
         5812  +  return rc;
         5813  +}
         5814  +
         5815  +static int freeServerPage2(BtShared *pBt, MemPage *pPage, Pgno iPage){
         5816  +  int rc;                         /* Return code */
         5817  +  MemPage *pTrunk = 0;            /* The node page */
         5818  +
         5819  +  assert( sqlite3_mutex_held(pBt->mutex) );
         5820  +
         5821  +  rc = findServerTrunk(pBt, 0, &pTrunk);
         5822  +  if( rc==SQLITE_OK ){
         5823  +    int nFree;              /* Number of free pages on this trunk page */
         5824  +    nFree = (int)get4byte(&pTrunk->aData[4]);
         5825  +    if( nFree>=((pBt->usableSize / 4) - 2) ){
         5826  +      if( pPage==0 ){
         5827  +        rc = btreeGetUnusedPage(pBt, iPage, &pPage, 0);
         5828  +      }else{
         5829  +        sqlite3PagerRef(pPage->pDbPage);
         5830  +      }
         5831  +      rc = sqlite3PagerWrite(pPage->pDbPage);
         5832  +      if( rc==SQLITE_OK ){
         5833  +        memcpy(pPage->aData, pTrunk->aData, pBt->usableSize);
         5834  +        put4byte(&pTrunk->aData[0], iPage);
         5835  +        put4byte(&pTrunk->aData[4], 0);
         5836  +      }
         5837  +      releasePage(pPage);
         5838  +    }else{
         5839  +      put4byte(&pTrunk->aData[8+nFree*4], iPage);
         5840  +      put4byte(&pTrunk->aData[4], (u32)nFree+1);
         5841  +    }
         5842  +  }
         5843  +
         5844  +  return rc;
         5845  +}
         5846  +
         5847  +#else
         5848  +# define allocateServerPage(v, w, x, y, z) SQLITE_OK
         5849  +# define freeServerPage2(x, y, z) SQLITE_OK
         5850  +#endif /* SQLITE_SERVER_EDITION */
         5851  +
  5619   5852   /*
  5620   5853   ** Allocate a new page from the database file.
  5621   5854   **
  5622   5855   ** The new page is marked as dirty.  (In other words, sqlite3PagerWrite()
  5623   5856   ** has already been called on the new page.)  The new page has also
  5624   5857   ** been referenced and the calling routine is responsible for calling
  5625   5858   ** sqlite3PagerUnref() on the new page when it is done.
................................................................................
  5648   5881     MemPage *pPage1;
  5649   5882     int rc;
  5650   5883     u32 n;     /* Number of pages on the freelist */
  5651   5884     u32 k;     /* Number of leaves on the trunk of the freelist */
  5652   5885     MemPage *pTrunk = 0;
  5653   5886     MemPage *pPrevTrunk = 0;
  5654   5887     Pgno mxPage;     /* Total size of the database file */
         5888  +
         5889  +  if( sqlite3PagerIsServer(pBt->pPager) ){
         5890  +    return allocateServerPage(pBt, ppPage, pPgno, nearby, eMode); 
         5891  +  }
  5655   5892   
  5656   5893     assert( sqlite3_mutex_held(pBt->mutex) );
  5657   5894     assert( eMode==BTALLOC_ANY || (nearby>0 && IfNotOmitAV(pBt->autoVacuum)) );
  5658   5895     pPage1 = pBt->pPage1;
  5659   5896     mxPage = btreePagecount(pBt);
  5660   5897     /* EVIDENCE-OF: R-05119-02637 The 4-byte big-endian integer at offset 36
  5661   5898     ** stores stores the total number of pages on the freelist. */
................................................................................
  5976   6213     if( pMemPage ){
  5977   6214       pPage = pMemPage;
  5978   6215       sqlite3PagerRef(pPage->pDbPage);
  5979   6216     }else{
  5980   6217       pPage = btreePageLookup(pBt, iPage);
  5981   6218     }
  5982   6219   
  5983         -  /* Increment the free page count on pPage1 */
  5984         -  rc = sqlite3PagerWrite(pPage1->pDbPage);
  5985         -  if( rc ) goto freepage_out;
  5986         -  nFree = get4byte(&pPage1->aData[36]);
  5987         -  put4byte(&pPage1->aData[36], nFree+1);
  5988         -
  5989   6220     if( pBt->btsFlags & BTS_SECURE_DELETE ){
  5990   6221       /* If the secure_delete option is enabled, then
  5991   6222       ** always fully overwrite deleted information with zeros.
  5992   6223       */
  5993   6224       if( (!pPage && ((rc = btreeGetPage(pBt, iPage, &pPage, 0))!=0) )
  5994   6225        ||            ((rc = sqlite3PagerWrite(pPage->pDbPage))!=0)
  5995   6226       ){
  5996   6227         goto freepage_out;
  5997   6228       }
  5998   6229       memset(pPage->aData, 0, pPage->pBt->pageSize);
  5999   6230     }
         6231  +  
         6232  +  if( sqlite3PagerIsServer(pBt->pPager) ){
         6233  +    rc = freeServerPage2(pBt, pPage, iPage);
         6234  +    goto freepage_out;
         6235  +  }
         6236  +
         6237  +  /* Increment the free page count on pPage1 */
         6238  +  rc = sqlite3PagerWrite(pPage1->pDbPage);
         6239  +  if( rc ) goto freepage_out;
         6240  +  nFree = get4byte(&pPage1->aData[36]);
         6241  +  put4byte(&pPage1->aData[36], nFree+1);
  6000   6242   
  6001   6243     /* If the database supports auto-vacuum, write an entry in the pointer-map
  6002   6244     ** to indicate that the page is free.
  6003   6245     */
  6004   6246     if( ISAUTOVACUUM ){
  6005   6247       ptrmapPut(pBt, iPage, PTRMAP_FREEPAGE, 0, &rc);
  6006   6248       if( rc ) goto freepage_out;
................................................................................
  9438   9680     pCheck->v1 = saved_v1;
  9439   9681     pCheck->v2 = saved_v2;
  9440   9682     return depth+1;
  9441   9683   }
  9442   9684   #endif /* SQLITE_OMIT_INTEGRITY_CHECK */
  9443   9685   
  9444   9686   #ifndef SQLITE_OMIT_INTEGRITY_CHECK
         9687  +
         9688  +#if !defined(SQLITE_OMIT_INTEGRITY_CHECK) && defined(SQLITE_SERVER_EDITION)
         9689  +static void checkServerList(IntegrityCk *pCheck){
         9690  +  u32 pgnoNode = get4byte(&pCheck->pBt->pPage1->aData[32]);
         9691  +  if( pgnoNode ){
         9692  +    DbPage *pNode = 0;
         9693  +    u8 *aNodeData;
         9694  +    u32 nList;                    /* Number of free-lists */
         9695  +    int i;
         9696  +
         9697  +    checkRef(pCheck, pgnoNode);
         9698  +    if( sqlite3PagerGet(pCheck->pPager, (Pgno)pgnoNode, &pNode, 0) ){
         9699  +      checkAppendMsg(pCheck, "failed to get node page %d", pgnoNode);
         9700  +      return;
         9701  +    }
         9702  +    aNodeData = sqlite3PagerGetData(pNode);
         9703  +    nList = get4byte(&aNodeData[4]);
         9704  +    for(i=0; i<nList; i++){
         9705  +      u32 pgnoTrunk = get4byte(&aNodeData[8+4*i]);
         9706  +      while( pgnoTrunk ){
         9707  +        DbPage *pTrunk = 0;
         9708  +        checkRef(pCheck, pgnoTrunk);
         9709  +        if( sqlite3PagerGet(pCheck->pPager, (Pgno)pgnoTrunk, &pTrunk, 0) ){
         9710  +          checkAppendMsg(pCheck, "failed to get page %d", pgnoTrunk);
         9711  +          pgnoTrunk = 0;
         9712  +        }else{
         9713  +          u8 *aTrunkData = sqlite3PagerGetData(pTrunk);
         9714  +          int nLeaf = (int)get4byte(&aTrunkData[4]);
         9715  +          int iLeaf;
         9716  +          for(iLeaf=0; iLeaf<nLeaf; iLeaf++){
         9717  +            u32 pgnoLeaf = get4byte(&aTrunkData[8+iLeaf*4]);
         9718  +            checkRef(pCheck, pgnoLeaf);
         9719  +          }
         9720  +          pgnoTrunk = get4byte(&aTrunkData[0]);
         9721  +          sqlite3PagerUnref(pTrunk);
         9722  +        }
         9723  +      }
         9724  +    }
         9725  +
         9726  +    sqlite3PagerUnref(pNode);
         9727  +  }
         9728  +}
         9729  +#endif
  9445   9730   /*
  9446   9731   ** This routine does a complete check of the given BTree file.  aRoot[] is
  9447   9732   ** an array of page numbers where each page number is the root page of
  9448   9733   ** a table.  nRoot is the number of entries in aRoot.
  9449   9734   **
  9450   9735   ** A read-only or read-write transaction must be opened before calling
  9451   9736   ** this function.
................................................................................
  9503   9788   
  9504   9789     i = PENDING_BYTE_PAGE(pBt);
  9505   9790     if( i<=sCheck.nPage ) setPageReferenced(&sCheck, i);
  9506   9791   
  9507   9792     /* Check the integrity of the freelist
  9508   9793     */
  9509   9794     sCheck.zPfx = "Main freelist: ";
  9510         -  checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
  9511         -            get4byte(&pBt->pPage1->aData[36]));
         9795  +#ifdef SQLITE_SERVER_EDITION
         9796  +  if( sqlite3PagerIsServer(pBt->pPager) ){
         9797  +    checkServerList(&sCheck);
         9798  +  }else
         9799  +#endif
         9800  +  {
         9801  +    checkList(&sCheck, 1, get4byte(&pBt->pPage1->aData[32]),
         9802  +        get4byte(&pBt->pPage1->aData[36]));
         9803  +  }
  9512   9804     sCheck.zPfx = 0;
  9513   9805   
  9514   9806     /* Check all the tables.
  9515   9807     */
  9516   9808     testcase( pBt->db->flags & SQLITE_CellSizeCk );
  9517   9809     pBt->db->flags &= ~SQLITE_CellSizeCk;
  9518   9810     for(i=0; (int)i<nRoot && sCheck.mxErr; i++){

Changes to src/pager.c.

  7656   7656   ** is empty, return 0.
  7657   7657   */
  7658   7658   int sqlite3PagerWalFramesize(Pager *pPager){
  7659   7659     assert( pPager->eState>=PAGER_READER );
  7660   7660     return sqlite3WalFramesize(pPager->pWal);
  7661   7661   }
  7662   7662   #endif
         7663  +
         7664  +#ifdef SQLITE_SERVER_EDITION
         7665  +int sqlite3PagerIsServer(Pager *pPager){
         7666  +  return pagerIsServer(pPager);
         7667  +}
         7668  +int sqlite3PagerWritelock(Pager *pPager, Pgno pgno){
         7669  +  return sqlite3ServerLock(pPager->pServer, pgno, 1);
         7670  +}
         7671  +#endif
  7663   7672   
  7664   7673   #endif /* SQLITE_OMIT_DISKIO */

Changes to src/pager.h.

   234    234   #else
   235    235   # define disable_simulated_io_errors()
   236    236   # define enable_simulated_io_errors()
   237    237   #endif
   238    238   
   239    239   #ifdef SQLITE_SERVER_EDITION
   240    240     int sqlite3PagerRollbackJournal(Pager*, int);
          241  +  int sqlite3PagerIsServer(Pager *pPager);
          242  +  int sqlite3PagerWritelock(Pager *pPager, Pgno);
   241    243   #endif
   242    244   
   243    245   #endif /* SQLITE_PAGER_H */

Changes to src/server.c.

   476    476             sqlite3_log(SQLITE_BUSY_DEADLOCK, "Conflict at page %d", (int)pgno);
   477    477             rc = SQLITE_BUSY_DEADLOCK;
   478    478             goto server_lock_out;
   479    479           }
   480    480           v = *pSlot;
   481    481         }
   482    482   
          483  +      n = v | (1 << p->iClient);
   483    484         if( bWrite ){
   484         -        n = v | ((p->iClient+1) << HMA_CLIENT_SLOTS);
   485         -      }else{
   486         -        n = v | (1 << p->iClient);
          485  +        n = n | ((p->iClient+1) << HMA_CLIENT_SLOTS);
   487    486         }
   488    487         if( __sync_val_compare_and_swap(pSlot, v, n)==v ) break;
   489    488         v = *pSlot;
   490    489       }
   491    490     }
   492    491   
   493    492   server_lock_out:
   494    493     return rc;
   495    494   }
   496    495   
   497    496   #endif /* ifdef SQLITE_SERVER_EDITION */

Changes to test/server2.test.

    37     37     db2 eval { CREATE TABLE t3(a, b) }
    38     38     file exists test.db-hma
    39     39   } {1}
    40     40   do_test 1.4 {
    41     41     db2 close
    42     42     file exists test.db-hma
    43     43   } {1}
    44         -do_test 1.5 {
           44  +integrity_check 1.5
           45  +do_test 1.6 {
    45     46     db close
    46     47     file exists test.db-hma
    47     48   } {0}
    48     49   
    49     50   
    50     51   #-------------------------------------------------------------------------
    51     52   #