Index: src/os_unix.c
==================================================================
--- src/os_unix.c
+++ src/os_unix.c
@@ -3508,11 +3508,15 @@
     case SQLITE_FCNTL_CHUNK_SIZE: {
       pFile->szChunk = *(int *)pArg;
       return SQLITE_OK;
     }
     case SQLITE_FCNTL_SIZE_HINT: {
-      return fcntlSizeHint(pFile, *(i64 *)pArg);
+      int rc;
+      SimulateIOErrorBenign(1);
+      rc = fcntlSizeHint(pFile, *(i64 *)pArg);
+      SimulateIOErrorBenign(0);
+      return rc;
     }
     case SQLITE_FCNTL_PERSIST_WAL: {
       int bPersist = *(int*)pArg;
       if( bPersist<0 ){
         *(int*)pArg = (pFile->ctrlFlags & UNIXFILE_PERSIST_WAL)!=0;

Index: src/pcache1.c
==================================================================
--- src/pcache1.c
+++ src/pcache1.c
@@ -22,10 +22,13 @@
 typedef struct PCache1 PCache1;
 typedef struct PgHdr1 PgHdr1;
 typedef struct PgFreeslot PgFreeslot;
 typedef struct PGroup PGroup;
+typedef struct PGroupBlock PGroupBlock;
+typedef struct PGroupBlockList PGroupBlockList;
+
 /* Each page cache (or PCache) belongs to a PGroup.  A PGroup is a set
 ** of one or more PCaches that are able to recycle each others unpinned
 ** pages when they are under memory pressure.  A PGroup is an instance of
 ** the following object.
 **
@@ -51,11 +54,66 @@
   int nMaxPage;                        /* Sum of nMax for purgeable caches */
   int nMinPage;                        /* Sum of nMin for purgeable caches */
   int mxPinned;                        /* nMaxpage + 10 - nMinPage */
   int nCurrentPage;                    /* Number of purgeable pages allocated */
   PgHdr1 *pLruHead, *pLruTail;         /* LRU list of unpinned pages */
+  PGroupBlockList *pBlockList;         /* List of block-lists for this group */
 };
+
+/*
+** If SQLITE_PAGECACHE_BLOCKALLOC is defined when the library is built,
+** each PGroup structure has a linked list of the following structures
+** starting at PGroup.pBlockList. There is one entry for each distinct
+** page-size currently used by members of the PGroup (i.e. 1024 bytes,
+** 4096 bytes etc.). Variable PGroupBlockList.nByte is set to the actual
+** allocation size requested by each pcache, which is the database
+** page-size plus the various header structures used by the pcache, pager
+** and btree layers. Usually around (pgsz+200) bytes.
+**
+** This size of (pgsz+200) bytes is not allocated efficiently by some
+** implementations of malloc. In particular, some implementations are only
+** able to allocate blocks of memory in chunks of 2^N bytes, where N is
+** some integer value. Since the page-size is itself a power of 2, a
+** (pgsz+200) byte request is rounded up to 2*pgsz bytes, wasting roughly
+** (pgsz-200) bytes in each allocation.
+**
+** If SQLITE_PAGECACHE_BLOCKALLOC is defined, the (pgsz+200) byte blocks
+** are not allocated directly. Instead, blocks of roughly M*(pgsz+200) bytes
+** are requested from the malloc allocator. After a block is returned,
+** sqlite3MallocSize() is used to determine how many (pgsz+200) byte
+** allocations can fit in the space returned by malloc(). This value may
+** be more than M.
+**
+** The blocks are stored in a doubly-linked list. Variable PGroupBlock.nEntry
+** contains the number of allocations that will fit in the aData[] space.
+** nEntry is limited to the number of bits in bitmask mUsed. If a slot
+** within aData is in use, the corresponding bit in mUsed is set. Thus
+** when (mUsed+1==(1 << nEntry)) the block is completely full.
+**
+** Each time a slot within a block is freed, the block is moved to the start
+** of the linked-list. And if a block becomes completely full, then it is
+** moved to the end of the list. As a result, when searching for a free
+** slot, only the first block in the list need be examined.
+** If it is full, then it is guaranteed that all blocks are full.
+*/
+struct PGroupBlockList {
+  int nByte;                 /* Size of each allocation in bytes */
+  PGroupBlock *pFirst;       /* First PGroupBlock in list */
+  PGroupBlock *pLast;        /* Last PGroupBlock in list */
+  PGroupBlockList *pNext;    /* Next block-list attached to group */
+};
+
+struct PGroupBlock {
+  Bitmask mUsed;             /* Mask of used slots */
+  int nEntry;                /* Maximum number of allocations in aData[] */
+  u8 *aData;                 /* Pointer to data block */
+  PGroupBlock *pNext;        /* Next PGroupBlock in list */
+  PGroupBlock *pPrev;        /* Previous PGroupBlock in list */
+  PGroupBlockList *pList;    /* Owner list */
+};
+
+/* Minimum value for PGroupBlock.nEntry */
+#define PAGECACHE_BLOCKALLOC_MINENTRY 15
 
 /* Each page cache is an instance of the following object.  Every
 ** open database file (including each in-memory database and each
 ** temporary or transient database) has a single page cache which
 ** is an instance of this object.
@@ -155,10 +213,21 @@
 **
 **   assert( PGHDR1_TO_PAGE(PAGE_TO_PGHDR1(pCache, X))==X );
 */
 #define PGHDR1_TO_PAGE(p)    (void*)(((char*)p) - p->pCache->szPage)
 #define PAGE_TO_PGHDR1(c, p) (PgHdr1*)(((char*)p) + c->szPage)
+
+/*
+** Macros used by the SQLITE_PAGECACHE_BLOCKALLOC code to store and retrieve
+** a PGroupBlock pointer based on a pointer to a page buffer.
+*/
+#define PAGE_SET_BLOCKPTR(pCache, pPg, pBlock) \
+  ( *(PGroupBlock **)&(((u8*)pPg)[sizeof(PgHdr1) + pCache->szPage]) = pBlock )
+
+#define PAGE_GET_BLOCKPTR(pCache, pPg) \
+  ( *(PGroupBlock **)&(((u8*)pPg)[sizeof(PgHdr1) + pCache->szPage]) )
+
 /*
 ** Macros to enter and leave the PCache LRU mutex.
 */
 #define pcache1EnterMutex(X)  sqlite3_mutex_enter((X)->mutex)
@@ -280,18 +349,148 @@
     sqlite3MemdebugSetType(p, MEMTYPE_PCACHE);
     return iSize;
   }
 }
 #endif /* SQLITE_ENABLE_MEMORY_MANAGEMENT */
+
+/*
+** The block pBlock belongs to list pList but is not currently linked in.
+** Insert it into the start of the list.
+*/
+static void addBlockToList(PGroupBlockList *pList, PGroupBlock *pBlock){
+  pBlock->pPrev = 0;
+  pBlock->pNext = pList->pFirst;
+  pList->pFirst = pBlock;
+  if( pBlock->pNext ){
+    pBlock->pNext->pPrev = pBlock;
+  }else{
+    assert( pList->pLast==0 );
+    pList->pLast = pBlock;
+  }
+}
+
+/*
+** If there are no blocks in the list headed by pList, remove pList
+** from the pGroup->pBlockList list and free it with sqlite3_free().
+*/
+static void freeListIfEmpty(PGroup *pGroup, PGroupBlockList *pList){
+  assert( sqlite3_mutex_held(pGroup->mutex) );
+  if( pList->pFirst==0 ){
+    PGroupBlockList **pp;
+    for(pp=&pGroup->pBlockList; *pp!=pList; pp=&(*pp)->pNext);
+    *pp = (*pp)->pNext;
+    sqlite3_free(pList);
+  }
+}
 
 /*
 ** Allocate a new page object initially associated with cache pCache.
 */
 static PgHdr1 *pcache1AllocPage(PCache1 *pCache){
   int nByte = sizeof(PgHdr1) + pCache->szPage;
-  void *pPg = pcache1Alloc(nByte);
+  void *pPg = 0;
   PgHdr1 *p;
+
+#ifdef SQLITE_PAGECACHE_BLOCKALLOC
+  PGroup *pGroup = pCache->pGroup;
+  PGroupBlockList *pList;
+  PGroupBlock *pBlock;
+  int i;
+
+  nByte += sizeof(PGroupBlockList *);
+  nByte = ROUND8(nByte);
+
+  do{
+    for(pList=pGroup->pBlockList; pList; pList=pList->pNext){
+      if( pList->nByte==nByte ) break;
+    }
+    if( pList==0 ){
+      PGroupBlockList *pNew;
+      pcache1LeaveMutex(pCache->pGroup);
+      pNew = (PGroupBlockList *)sqlite3MallocZero(sizeof(PGroupBlockList));
+      pcache1EnterMutex(pCache->pGroup);
+      if( pNew==0 ){
+        /* malloc() failure. Return early. */
+        return 0;
+      }
+      for(pList=pGroup->pBlockList; pList; pList=pList->pNext){
+        if( pList->nByte==nByte ) break;
+      }
+      if( pList ){
+        sqlite3_free(pNew);
+      }else{
+        pNew->nByte = nByte;
+        pNew->pNext = pGroup->pBlockList;
+        pGroup->pBlockList = pNew;
+        pList = pNew;
+      }
+    }
+  }while( pList==0 );
+
+  pBlock = pList->pFirst;
+  if( pBlock==0 || pBlock->mUsed==(((Bitmask)1<<pBlock->nEntry)-1) ){
+    int sz;
+
+    /* Allocate a new block. Try to allocate enough space for the PGroupBlock
+    ** structure and MINENTRY allocations of nByte bytes each. If the
+    ** allocator returns more memory than requested, then more than MINENTRY
+    ** allocations may fit in it. */
+    pcache1LeaveMutex(pCache->pGroup);
+    sz = sizeof(PGroupBlock) + PAGECACHE_BLOCKALLOC_MINENTRY * nByte;
+    pBlock = (PGroupBlock *)sqlite3Malloc(sz);
+    pcache1EnterMutex(pCache->pGroup);
+
+    if( !pBlock ){
+      freeListIfEmpty(pGroup, pList);
+      return 0;
+    }
+    pBlock->nEntry = (sqlite3MallocSize(pBlock) - sizeof(PGroupBlock)) / nByte;
+    if( pBlock->nEntry>=BMS ){
+      pBlock->nEntry = BMS-1;
+    }
+    pBlock->pList = pList;
+    pBlock->mUsed = 0;
+    pBlock->aData = (u8 *)&pBlock[1];
+    addBlockToList(pList, pBlock);
+
+    sz = sqlite3MallocSize(pBlock);
+    sqlite3_mutex_enter(pcache1.mutex);
+    sqlite3StatusAdd(SQLITE_STATUS_PAGECACHE_OVERFLOW, sz);
+    sqlite3_mutex_leave(pcache1.mutex);
+  }
+
+  for(i=0; pPg==0 && ALWAYS(i<pBlock->nEntry); i++){
+    if( 0==(pBlock->mUsed & ((Bitmask)1<<i)) ){
+      pBlock->mUsed |= ((Bitmask)1<<i);
+      pPg = (void *)&pBlock->aData[pList->nByte * i];
+    }
+  }
+  assert( pPg );
+  PAGE_SET_BLOCKPTR(pCache, pPg, pBlock);
+
+  /* If the block is now full, shift it to the end of the list */
+  if( pBlock->mUsed==(((Bitmask)1<<pBlock->nEntry)-1) && pList->pLast!=pBlock ){
+    assert( pList->pFirst==pBlock );
+    assert( pBlock->pPrev==0 );
+    assert( pList->pLast->pNext==0 );
+    pList->pFirst = pBlock->pNext;
+    pList->pFirst->pPrev = 0;
+    pBlock->pPrev = pList->pLast;
+    pBlock->pNext = 0;
+    pList->pLast->pNext = pBlock;
+    pList->pLast = pBlock;
+  }
+#else
+  /* The group mutex must be released before pcache1Alloc() is called. This
+  ** is because it may call sqlite3_release_memory(), which assumes that
+  ** this mutex is not held. */
+  assert( sqlite3_mutex_held(pCache->pGroup->mutex) );
+  pcache1LeaveMutex(pCache->pGroup);
+  pPg = pcache1Alloc(nByte);
+  pcache1EnterMutex(pCache->pGroup);
+#endif
+
   if( pPg ){
     p = PAGE_TO_PGHDR1(pCache, pPg);
     if( pCache->bPurgeable ){
       pCache->pGroup->nCurrentPage++;
     }
@@ -309,14 +508,56 @@
 ** with a NULL pointer, so we mark the NULL test with ALWAYS().
 */
 static void pcache1FreePage(PgHdr1 *p){
   if( ALWAYS(p) ){
     PCache1 *pCache = p->pCache;
+    void *pPg = PGHDR1_TO_PAGE(p);
+
+#ifdef SQLITE_PAGECACHE_BLOCKALLOC
+    PGroupBlock *pBlock = PAGE_GET_BLOCKPTR(pCache, pPg);
+    PGroupBlockList *pList = pBlock->pList;
+    int i = ((u8 *)pPg - pBlock->aData) / pList->nByte;
+
+    assert( pPg==(void *)&pBlock->aData[i*pList->nByte] );
+    assert( pBlock->mUsed & ((Bitmask)1<<i) );
+    pBlock->mUsed &= ~((Bitmask)1<<i);
+
+    /* Unlink the block from its list */
+    if( pList->pFirst==pBlock ){
+      pList->pFirst = pBlock->pNext;
+      if( pList->pFirst ) pList->pFirst->pPrev = 0;
+    }else{
+      pBlock->pPrev->pNext = pBlock->pNext;
+    }
+    if( pList->pLast==pBlock ){
+      pList->pLast = pBlock->pPrev;
+      if( pList->pLast ) pList->pLast->pNext = 0;
+    }else{
+      pBlock->pNext->pPrev = pBlock->pPrev;
+    }
+
+    if( pBlock->mUsed==0 ){
+      PGroup *pGroup = p->pCache->pGroup;
+
+      int sz = sqlite3MallocSize(pBlock);
+      sqlite3_mutex_enter(pcache1.mutex);
+      sqlite3StatusAdd(SQLITE_STATUS_PAGECACHE_OVERFLOW, -sz);
+      sqlite3_mutex_leave(pcache1.mutex);
+      freeListIfEmpty(pGroup, pList);
+      sqlite3_free(pBlock);
+    }else{
+      addBlockToList(pList, pBlock);
+    }
+#else
+    assert( sqlite3_mutex_held(p->pCache->pGroup->mutex) );
+    pcache1Free(pPg);
+#endif
     if( pCache->bPurgeable ){
       pCache->pGroup->nCurrentPage--;
     }
-    pcache1Free(PGHDR1_TO_PAGE(p));
   }
 }
 
 /*
 ** Malloc function used by SQLite to obtain space from the buffer configured
@@ -750,13 +991,11 @@
 
   /* Step 5. If a usable page buffer has still not been found,
   ** attempt to allocate a new one.
   */
   if( !pPage ){
     if( createFlag==1 ) sqlite3BeginBenignMalloc();
-    pcache1LeaveMutex(pGroup);
     pPage = pcache1AllocPage(pCache);
-    pcache1EnterMutex(pGroup);
     if( createFlag==1 ) sqlite3EndBenignMalloc();
   }
 
   if( pPage ){
     unsigned int h = iKey % pCache->nHash;

Index: src/test_config.c
==================================================================
--- src/test_config.c
+++ src/test_config.c
@@ -552,10 +552,16 @@
 #ifdef YYTRACKMAXSTACKDEPTH
   Tcl_SetVar2(interp, "sqlite_options", "yytrackmaxstackdepth", "1", TCL_GLOBAL_ONLY);
 #else
   Tcl_SetVar2(interp, "sqlite_options", "yytrackmaxstackdepth", "0", TCL_GLOBAL_ONLY);
 #endif
+
+#ifdef SQLITE_PAGECACHE_BLOCKALLOC
+  Tcl_SetVar2(interp, "sqlite_options", "blockalloc", "1", TCL_GLOBAL_ONLY);
+#else
+  Tcl_SetVar2(interp, "sqlite_options", "blockalloc", "0", TCL_GLOBAL_ONLY);
+#endif
 
 #define LINKVAR(x) { \
     static const int cv_ ## x = SQLITE_ ## x; \
     Tcl_LinkVar(interp, "SQLITE_" #x, (char *)&(cv_ ## x), \
                 TCL_LINK_INT | TCL_LINK_READ_ONLY); }

Index: test/memdb.test
==================================================================
--- test/memdb.test
+++ test/memdb.test
@@ -405,11 +405,11 @@
   }
 } 0
 
 # Test that auto-vacuum works with in-memory databases.
 #
-ifcapable autovacuum {
+ifcapable autovacuum&&!blockalloc {
   do_test memdb-9.1 {
     db close
     sqlite3 db test.db
     db cache size 0
     execsql {

Index: test/memsubsys1.test
==================================================================
--- test/memsubsys1.test
+++ test/memsubsys1.test
@@ -22,10 +22,17 @@
 #
 if {[permutation] == "memsubsys1"} {
   finish_test
   return
 }
+
+# Nor will it work if the pager is allocating memory in blocks.
+#
+ifcapable blockalloc {
+  finish_test
+  return
+}
 
 # This procedure constructs a new database in test.db.  It fills
 # this database with many small records (enough to force multiple
 # rebalance operations in the btree-layer and to require a large
 # page cache), verifies correct results, then returns.
Index: test/pcache2.test
==================================================================
--- test/pcache2.test
+++ test/pcache2.test
@@ -14,10 +14,17 @@
 # $Id: pcache2.test,v 1.5 2009/07/18 14:36:24 danielk1977 Exp $
 
 set testdir [file dirname $argv0]
 source $testdir/tester.tcl
+
+# If compiled with blockalloc, pagecache memory is not used, which
+# causes these tests to fail.
+#
+ifcapable blockalloc {
+  finish_test
+  return
+}
 
 # Set up a pcache memory pool so that we can easily track how many
 # pages are being used for cache.
 #
 do_test pcache2-1.1 {
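
The bitmask bookkeeping that pcache1AllocPage() and pcache1FreePage() perform under
SQLITE_PAGECACHE_BLOCKALLOC can be illustrated outside of SQLite. The sketch below is
not part of the patch: the names DemoBlock, demoNewBlock(), demoAlloc() and demoFree()
are invented for illustration, plain malloc() stands in for sqlite3Malloc() and
sqlite3MallocSize(), and the PGroupBlockList management (moving full blocks to the tail
of the list, newly freed blocks back to the head) is omitted so that only the per-block
slot/bitmask arithmetic remains.

/*
** Illustrative sketch only -- not part of the patch above. It mimics the
** SQLITE_PAGECACHE_BLOCKALLOC idea with ordinary malloc(): carve one large
** allocation into fixed-size slots and track used slots with a bitmask.
*/
#include <assert.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

#define DEMO_NENTRY 15                  /* Slots per block (fits in the bitmask) */

typedef struct DemoBlock DemoBlock;
struct DemoBlock {
  uint64_t mUsed;                       /* Bit i set => slot i is in use */
  int nByte;                            /* Size of each slot in bytes */
  unsigned char aData[1];               /* Slot storage follows the header */
};

/* Allocate a block whose aData[] holds DEMO_NENTRY slots of nByte bytes each. */
static DemoBlock *demoNewBlock(int nByte){
  DemoBlock *p = malloc(sizeof(DemoBlock) + DEMO_NENTRY*nByte);
  if( p ){
    p->mUsed = 0;
    p->nByte = nByte;
  }
  return p;
}

/* Return a free slot from pBlock, or NULL if every slot is in use. */
static void *demoAlloc(DemoBlock *pBlock){
  int i;
  for(i=0; i<DEMO_NENTRY; i++){
    if( (pBlock->mUsed & ((uint64_t)1<<i))==0 ){
      pBlock->mUsed |= (uint64_t)1<<i;
      return &pBlock->aData[i*pBlock->nByte];
    }
  }
  return 0;                             /* Block is completely full */
}

/* Release a slot previously returned by demoAlloc(). */
static void demoFree(DemoBlock *pBlock, void *pSlot){
  int i = (int)(((unsigned char*)pSlot - pBlock->aData) / pBlock->nByte);
  assert( pBlock->mUsed & ((uint64_t)1<<i) );
  pBlock->mUsed &= ~((uint64_t)1<<i);
}

int main(void){
  DemoBlock *pBlock = demoNewBlock(1024+200);   /* roughly page-size + headers */
  void *a = demoAlloc(pBlock);
  void *b = demoAlloc(pBlock);
  printf("used mask after two allocations: 0x%llx\n",
         (unsigned long long)pBlock->mUsed);    /* prints 0x3 */
  demoFree(pBlock, a);
  printf("used mask after freeing one:     0x%llx\n",
         (unsigned long long)pBlock->mUsed);    /* prints 0x2 */
  demoFree(pBlock, b);
  free(pBlock);
  return 0;
}

Running the sketch prints the used-slot mask after each step, which mirrors how mUsed
is set in pcache1AllocPage() and cleared in pcache1FreePage() in the patch.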