SQLite

Check-in [9b7f80246f]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Avoid writer starvation by adding a RESERVED state to page locks.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | server-edition
Files: files | file ages | folders
SHA3-256: 9b7f80246f2b9921483ab23457865e783ee70b93f67bcecc0c16516447a05875
User & Date: dan 2017-05-15 19:32:58.633
Context
2017-05-22
08:01
Add code to this branch to emit a log message after each cumulative second that the WRITER lock has been held. (check-in: a726d98122 user: dan tags: server-edition)
2017-05-15
19:32
Avoid writer starvation by adding a RESERVED state to page locks. (check-in: 9b7f80246f user: dan tags: server-edition)
2017-05-13
19:07
Avoid running recovery while there is another read/write client. (check-in: a38858a24c user: dan tags: server-edition)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/server.c.
20
21
22
23
24
25
26
27
28
29






30


31

32


33
34







35
36
37
38
39
40
41
42
43
44
45
46
47
48

49
50
51
52
53
54
55
**
**   16*4 bytes - locking slots. Connections hold a read-lock on a locking slot
**                when they are connected, a write lock when they have an open
**                transaction.
**
**    N*4 bytes - Page locking slots. N is HMA_PAGELOCK_SLOTS.
**
** Page lock slot format:
**
**    Least significant HMA_CLIENT_SLOTS bits are used for read-locks. If bit 0 is set,






**    client 0 holds a read-lock.


**

**    If (v) is the value of the locking slot and (v>>HMA_CLIENT_SLOTS) is


**    not zero, then the write-lock holder is client ((v>>HMA_CLIENT_SLOTS)-1).
**







*/

#ifdef SQLITE_SERVER_EDITION

#define HMA_CLIENT_SLOTS   16
#define HMA_PAGELOCK_SLOTS (256*1024)

#define HMA_FILE_SIZE (4 + 4*HMA_CLIENT_SLOTS + 4*HMA_PAGELOCK_SLOTS)

#include "unistd.h"
#include "fcntl.h"
#include "sys/mman.h"
#include "sys/types.h"
#include "sys/stat.h"


typedef struct ServerHMA ServerHMA;

/*
** File-scope state for this module. At present it holds only the pointer
** to the linked list of ServerHMA objects.
*/
struct ServerGlobal {
  ServerHMA *pHma;                /* Linked list of all ServerHMA objects */
};
/* The single instance of the state above; static, so private to this file
** and zero-initialized (pHma starts out NULL). */
static struct ServerGlobal g_server;







|

|
>
>
>
>
>
>
|
>
>

>
|
>
>
|

>
>
>
>
>
>
>














>







20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
**
**   16*4 bytes - locking slots. Connections hold a read-lock on a locking slot
**                when they are connected, a write lock when they have an open
**                transaction.
**
**    N*4 bytes - Page locking slots. N is HMA_PAGELOCK_SLOTS.
**
** Page-locking slot format:
**
**   Each page-locking slot provides SHARED/RESERVED/EXCLUSIVE locks on a
**   single page. A RESERVED lock is similar to a RESERVED lock in SQLite's
**   rollback mode - existing SHARED locks may continue but new SHARED locks
**   may not be established. As in rollback mode, EXCLUSIVE and RESERVED 
**   locks are mutually exclusive.
**
**   Each 32-bit locking slot is divided into two sections - a bitmask for
**   read-locks and a single integer field for the write lock. The bitmask
**   occupies the least-significant 27 bits of the slot. The integer field
**   occupies the remaining 5 bits (so that it can store values from 0-31).
**
**   Each client has a unique integer client id. Currently these range from
**   0-15 (maximum of 16 concurrent connections). The page-locking slot format
**   allows this to be increased to 0-26 (maximum of 27 connections). To
**   take a SHARED lock, the corresponding bit is set in the locking slot
**   bitmask:
**
**     slot = slot | (1 << iClient);
**
**   To take an EXCLUSIVE or RESERVED lock, the integer part of the locking
**   slot is set to the client-id of the locker plus one (a value of zero 
**   indicates that no connection holds a RESERVED or EXCLUSIVE lock):
**
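**     slot = slot | ((iClient+1) << 27)
**
**   NOTE: the code below currently shifts by HMA_CLIENT_SLOTS (16) rather
**   than 27, so in practice the read-lock bitmask is 16 bits wide and the
**   integer field occupies the upper 16 bits of the slot.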
*/

#ifdef SQLITE_SERVER_EDITION

#define HMA_CLIENT_SLOTS   16
#define HMA_PAGELOCK_SLOTS (256*1024)

#define HMA_FILE_SIZE (4 + 4*HMA_CLIENT_SLOTS + 4*HMA_PAGELOCK_SLOTS)

#include "unistd.h"
#include "fcntl.h"
#include "sys/mman.h"
#include "sys/types.h"
#include "sys/stat.h"
#include "errno.h"

typedef struct ServerHMA ServerHMA;

/*
** File-scope state for this module. At present it holds only the pointer
** to the linked list of ServerHMA objects.
*/
struct ServerGlobal {
  ServerHMA *pHma;                /* Linked list of all ServerHMA objects */
};
/* The single instance of the state above; static, so private to this file
** and zero-initialized (pHma starts out NULL). */
static struct ServerGlobal g_server;
108
109
110
111
112
113
114



115
116
117
118
119
120
121
  memset(&l, 0, sizeof(l));
  l.l_type = aType[eLock];
  l.l_whence = SEEK_SET;
  l.l_start = iSlot*sizeof(u32);
  l.l_len = 1;

  res = fcntl(fd, (bBlock ? F_SETLKW : F_SETLK), &l);



  return (res==0 ? SQLITE_OK : SQLITE_BUSY);
}

static int serverMapFile(ServerHMA *p){
  assert( p->aMap==0 );
  p->aMap = mmap(0, HMA_FILE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, p->fd, 0);
  if( p->aMap==0 ){







>
>
>







127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
  memset(&l, 0, sizeof(l));
  l.l_type = aType[eLock];
  l.l_whence = SEEK_SET;
  l.l_start = iSlot*sizeof(u32);
  l.l_len = 1;

  res = fcntl(fd, (bBlock ? F_SETLKW : F_SETLK), &l);
  if( res && bBlock && errno==EDEADLK ){
    return SQLITE_BUSY_DEADLOCK;
  }
  return (res==0 ? SQLITE_OK : SQLITE_BUSY);
}

static int serverMapFile(ServerHMA *p){
  assert( p->aMap==0 );
  p->aMap = mmap(0, HMA_FILE_SIZE, PROT_READ|PROT_WRITE, MAP_SHARED, p->fd, 0);
  if( p->aMap==0 ){
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
  int bBlock,                     /* If true, block for this lock */
  u32 v,                          /* Value of blocking page locking slot */
  int *pbRetry                    /* OUT: True if caller should retry lock */
){
  int rc = SQLITE_OK;
  int iBlock = ((int)(v>>HMA_CLIENT_SLOTS))-1;

  if( iBlock<0 ){
    for(iBlock=0; iBlock<HMA_CLIENT_SLOTS; iBlock++){
      if( iBlock!=p->iClient && (v & (1<<iBlock)) ) break;
    }
  }
  assert( iBlock<HMA_CLIENT_SLOTS );

  serverEnterMutex();







|







384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
  int bBlock,                     /* If true, block for this lock */
  u32 v,                          /* Value of blocking page locking slot */
  int *pbRetry                    /* OUT: True if caller should retry lock */
){
  int rc = SQLITE_OK;
  int iBlock = ((int)(v>>HMA_CLIENT_SLOTS))-1;

  if( iBlock<0 || iBlock==p->iClient ){
    for(iBlock=0; iBlock<HMA_CLIENT_SLOTS; iBlock++){
      if( iBlock!=p->iClient && (v & (1<<iBlock)) ) break;
    }
  }
  assert( iBlock<HMA_CLIENT_SLOTS );

  serverEnterMutex();
443
444
445
446
447
448
449









450
451
452
453
454
455
456
457
458


459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486

487
488


489
490
491













492
493
494
495
496
497
498
499

500
501
502
503
504
505
506
507
508
509
510
511
512











513
514
515
516
517
518
519
520
521
522
523
524
/*
** Release all write-locks.
**
** This is currently a placeholder: parameter p is not used, no lock state
** is touched, and SQLITE_OK is always returned.
*/
int sqlite3ServerReleaseWriteLocks(Server *p){
  return SQLITE_OK;
}










/*
** Lock page pgno for reading (bWrite==0) or writing (bWrite==1).
**
** If parameter bBlock is non-zero, then make this a blocking lock if
** possible.
**
** The lock is recorded in the 32-bit page-locking slot returned by
** serverPageLockSlot(). The slot is updated with an atomic
** compare-and-swap, retrying whenever another client changes the slot
** between the read and the swap.
**
** Return SQLITE_OK if the lock is held on return (including the case where
** it was already held on entry). Otherwise return:
**
**   SQLITE_NOMEM_BKPT     - the aLock[] array could not be enlarged.
**   SQLITE_BUSY_DEADLOCK  - serverOvercomeLock() indicated that the
**                           conflicting lock should not be waited on.
**   (other)               - any non-SQLITE_OK code from serverOvercomeLock().
*/
int sqlite3ServerLock(Server *p, Pgno pgno, int bWrite, int bBlock){
  int rc = SQLITE_OK;

  /* Grow the aLock[] array, if required */
  if( p->nLock==p->nAlloc ){
    int nNew = p->nAlloc ? p->nAlloc*2 : 128;   /* Double, or start at 128 */
    u32 *aNew;
    aNew = (u32*)sqlite3_realloc(p->aLock, sizeof(u32)*nNew);
    if( aNew==0 ){
      /* p->aLock is left untouched, so the existing array remains valid */
      rc = SQLITE_NOMEM_BKPT;
    }else{
      p->aLock = aNew;
      p->nAlloc = nNew;
    }
  }
  if( rc==SQLITE_OK ){
    u32 *pSlot = serverPageLockSlot(p, pgno);
    u32 v = *pSlot;               /* Snapshot of the slot value */

    /* Check if the required lock is already held. If so, exit this function
    ** early. Otherwise, add an entry to the aLock[] array to record the fact
    ** that the lock may need to be released.  */
    if( bWrite ){
      /* Write-lock holder is stored as (client id + 1) above the low
      ** HMA_CLIENT_SLOTS bits; iLock is -1 if there is no holder. */
      int iLock = ((int)(v>>HMA_CLIENT_SLOTS)) - 1;
      if( iLock==p->iClient ) goto server_lock_out;
    }else{
      if( v & (1<<p->iClient) ) goto server_lock_out;
    }

    p->aLock[p->nLock++] = pgno;

    while( 1 ){
      u32 n;                      /* Proposed new slot value */

      /* Loop while the snapshot conflicts with the requested lock. A write
      ** request conflicts with any set bit other than this client's own
      ** read-lock bit. Any request conflicts with a non-zero write-lock
      ** field.
      **
      ** NOTE(review): the second term does not exclude this client, so a
      ** read request made while this client itself holds the write-lock
      ** also enters this loop - confirm whether callers can do that. */
      while( (bWrite && (v & ~(1 << p->iClient))) || (v >> HMA_CLIENT_SLOTS) ){
        int bRetry = 0;           /* Set by serverOvercomeLock() to retry */

        rc = serverOvercomeLock(p, bWrite, bBlock, v, &bRetry);
        if( rc!=SQLITE_OK ) goto server_lock_out;
        if( bRetry==0 ){
          /* There is a conflicting lock. Cannot obtain this lock. */
          sqlite3_log(SQLITE_BUSY_DEADLOCK, "Conflict at page %d", (int)pgno);
          rc = SQLITE_BUSY_DEADLOCK;
          goto server_lock_out;
        }

        /* Re-read the slot before testing for a conflict again */
        v = *pSlot;
      }

      /* No conflict in snapshot v. Build the new value: always set this
      ** client's read-lock bit, and for a write-lock also store
      ** (client id + 1) in the write-lock field. */
      n = v | (1 << p->iClient);
      if( bWrite ){
        n = n | ((p->iClient+1) << HMA_CLIENT_SLOTS);
      }
      /* Install n only if the slot still holds v. Otherwise another client
      ** changed it, so take a fresh snapshot and start over. */
      if( __sync_val_compare_and_swap(pSlot, v, n)==v ) break;
      v = *pSlot;
    }
  }

server_lock_out:

  return rc;
}

/*
** Report whether the page-locking slot for page pgno records a lock for
** client p.
**
** If bWrite is non-zero, return true if the field above the low
** HMA_CLIENT_SLOTS bits of the slot equals this client's id plus one.
** Otherwise, return true if this client's read-lock bit is set.
*/
int sqlite3ServerHasLock(Server *p, Pgno pgno, int bWrite){
  u32 slot = *serverPageLockSlot(p, pgno);
  int bHeld;
  if( bWrite==0 ){
    bHeld = (slot & (1 << p->iClient))!=0;
  }else{
    bHeld = (slot>>HMA_CLIENT_SLOTS)==(p->iClient+1);
  }
  return bHeld;
}

#endif /* ifdef SQLITE_SERVER_EDITION */







>
>
>
>
>
>
>
>
>









>
>














<











<

>


>
>

|

>
>
>
>
>
>
>
>
>
>
>
>
>








>













>
>
>
>
>
>
>
>
>
>
>












465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505

506
507
508
509
510
511
512
513
514
515
516

517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
/*
** Release all write-locks.
**
** This is currently a placeholder: parameter p is not used, no lock state
** is touched, and SQLITE_OK is always returned.
*/
int sqlite3ServerReleaseWriteLocks(Server *p){
  return SQLITE_OK;
}

/*
** Decode the write-locker field of page-locking slot value v.
**
** The bits of v above the low HMA_CLIENT_SLOTS bits store the client id of
** the EXCLUSIVE or RESERVED lock holder plus one. Return that client id,
** or -1 if the field is zero (no client holds such a lock).
**
** NOTE(review): this helper has external linkage; if it is only used
** within this file it could be declared static - confirm before changing.
*/
int serverWriteLocker(u32 v){
  int iField = (int)(v >> HMA_CLIENT_SLOTS);
  return iField - 1;
}

/*
** Lock page pgno for reading (bWrite==0) or writing (bWrite==1).
**
** If parameter bBlock is non-zero, then make this a blocking lock if
** possible.
**
** The lock is recorded in the 32-bit page-locking slot returned by
** serverPageLockSlot(). All updates to the slot use an atomic
** compare-and-swap, retrying whenever another client changes the slot
** between the read and the swap.
**
** For a blocking write request that finds other read-locks but no
** write-locker, this function first stores its own client id in the
** write-locker field (a RESERVED lock) and only then waits for the
** readers. If the call subsequently fails, that field is cleared again
** before returning.
**
** Return SQLITE_OK if the lock is held on return (including the case where
** it was already held on entry). Otherwise return:
**
**   SQLITE_NOMEM_BKPT     - the aLock[] array could not be enlarged.
**   SQLITE_BUSY_DEADLOCK  - serverOvercomeLock() indicated that the
**                           conflicting lock should not be waited on.
**   (other)               - any non-SQLITE_OK code from serverOvercomeLock().
*/
int sqlite3ServerLock(Server *p, Pgno pgno, int bWrite, int bBlock){
  int rc = SQLITE_OK;
  int bReserved = 0;              /* True once this call took RESERVED */
  u32 *pSlot = serverPageLockSlot(p, pgno);

  /* Grow the aLock[] array, if required */
  if( p->nLock==p->nAlloc ){
    int nNew = p->nAlloc ? p->nAlloc*2 : 128;   /* Double, or start at 128 */
    u32 *aNew;
    aNew = (u32*)sqlite3_realloc(p->aLock, sizeof(u32)*nNew);
    if( aNew==0 ){
      /* p->aLock is left untouched, so the existing array remains valid */
      rc = SQLITE_NOMEM_BKPT;
    }else{
      p->aLock = aNew;
      p->nAlloc = nNew;
    }
  }
  if( rc==SQLITE_OK ){

    u32 v = *pSlot;               /* Snapshot of the slot value */

    /* Check if the required lock is already held. If so, exit this function
    ** early. Otherwise, add an entry to the aLock[] array to record the fact
    ** that the lock may need to be released.  */
    if( bWrite ){
      int iLock = ((int)(v>>HMA_CLIENT_SLOTS)) - 1;
      if( iLock==p->iClient ) goto server_lock_out;
    }else{
      if( v & (1<<p->iClient) ) goto server_lock_out;
    }

    p->aLock[p->nLock++] = pgno;

    while( 1 ){
      u32 n;                      /* Proposed new slot value */
      int w;                      /* Client in the write-locker field, or -1 */
      /* For a write request: the read-lock bits of every client except this
      ** one. For a read request: zero, as other readers do not conflict. */
      u32 mask = (bWrite ? (((1<<HMA_CLIENT_SLOTS)-1) & ~(1<<p->iClient)) : 0);

      /* Loop while the snapshot conflicts with the requested lock: either
      ** some other client occupies the write-locker field, or (write
      ** requests only) some other client holds a read-lock. */
      while( ((w = serverWriteLocker(v))>=0 && w!=p->iClient) || (v & mask) ){
        int bRetry = 0;           /* Set by serverOvercomeLock() to retry */

        if( w<0 && bWrite && bBlock ){
          /* Attempt a RESERVED lock before anything else */
          n = v | ((p->iClient+1) << HMA_CLIENT_SLOTS);
          assert( serverWriteLocker(n)==p->iClient );
          if( __sync_val_compare_and_swap(pSlot, v, n)!=v ){
            /* The slot changed underneath us. Take a fresh snapshot and
            ** re-evaluate the conflict test above. */
            v = *pSlot;
            continue;
          }
          /* RESERVED is now in place. Only the read-lock bits of other
          ** clients can still satisfy the loop condition for this client. */
          v = n;
          bReserved = 1;
        }

        rc = serverOvercomeLock(p, bWrite, bBlock, v, &bRetry);
        if( rc!=SQLITE_OK ) goto server_lock_out;
        if( bRetry==0 ){
          /* There is a conflicting lock. Cannot obtain this lock. */
          sqlite3_log(SQLITE_BUSY_DEADLOCK, "Conflict at page %d", (int)pgno);
          rc = SQLITE_BUSY_DEADLOCK;
          goto server_lock_out;
        }

        /* Re-read the slot before testing for a conflict again */
        v = *pSlot;
      }

      /* No conflict in snapshot v. Build the new value: always set this
      ** client's read-lock bit, and for a write-lock also store
      ** (client id + 1) in the write-locker field. */
      n = v | (1 << p->iClient);
      if( bWrite ){
        n = n | ((p->iClient+1) << HMA_CLIENT_SLOTS);
      }
      /* Install n only if the slot still holds v. Otherwise another client
      ** changed it, so take a fresh snapshot and start over. */
      if( __sync_val_compare_and_swap(pSlot, v, n)==v ) break;
      v = *pSlot;
    }
  }

server_lock_out:
  /* If this call failed after installing a RESERVED lock, back it out by
  ** clearing every bit above the low HMA_CLIENT_SLOTS bits, leaving the
  ** read-lock bits as they are. Retry until the compare-and-swap succeeds.
  **
  ** NOTE(review): the aLock[] entry appended above is not removed on this
  ** path; presumably the code that releases locks tolerates that - confirm. */
  if( rc!=SQLITE_OK && bReserved ){
    u32 n;
    u32 v;
    do{
      v = *pSlot;
      assert( serverWriteLocker(v)==p->iClient );
      n = v & ((1<<HMA_CLIENT_SLOTS)-1);
    }while( __sync_val_compare_and_swap(pSlot, v, n)!=v );
  }

  /* On success the slot must show the requested lock for this client */
  assert( rc!=SQLITE_OK || sqlite3ServerHasLock(p, pgno, bWrite) );
  return rc;
}

/*
** Report whether the page-locking slot for page pgno records a lock for
** client p.
**
** If bWrite is non-zero, return true if the field above the low
** HMA_CLIENT_SLOTS bits of the slot equals this client's id plus one.
** Otherwise, return true if this client's read-lock bit is set.
*/
int sqlite3ServerHasLock(Server *p, Pgno pgno, int bWrite){
  u32 slot = *serverPageLockSlot(p, pgno);
  int bHeld;
  if( bWrite==0 ){
    bHeld = (slot & (1 << p->iClient))!=0;
  }else{
    bHeld = (slot>>HMA_CLIENT_SLOTS)==(p->iClient+1);
  }
  return bHeld;
}

#endif /* ifdef SQLITE_SERVER_EDITION */
Changes to src/wal.c.
3118
3119
3120
3121
3122
3123
3124

3125
3126
3127
3128
3129
3130
3131
    if( rc==SQLITE_OK ){
      pWal->writeLock = 1;
      rc = walIndexTryHdr(pWal, &bDummy);
    }
    if( rc!=SQLITE_OK ){
      return rc;
    }

  }
  assert( walIsServer(pWal)==0 || sqlite3ServerHasLock(pWal->pServer, 0, 1) );

  /* If this frame set completes a transaction, then nTruncate>0.  If
  ** nTruncate==0 then this frame set does not complete the transaction. */
  assert( (isCommit!=0)==(nTruncate!=0) );








>







3118
3119
3120
3121
3122
3123
3124
3125
3126
3127
3128
3129
3130
3131
3132
    if( rc==SQLITE_OK ){
      pWal->writeLock = 1;
      rc = walIndexTryHdr(pWal, &bDummy);
    }
    if( rc!=SQLITE_OK ){
      return rc;
    }
    assert( sqlite3ServerHasLock(pWal->pServer, 0, 1) );
  }
  assert( walIsServer(pWal)==0 || sqlite3ServerHasLock(pWal->pServer, 0, 1) );

  /* If this frame set completes a transaction, then nTruncate>0.  If
  ** nTruncate==0 then this frame set does not complete the transaction. */
  assert( (isCommit!=0)==(nTruncate!=0) );

3393
3394
3395
3396
3397
3398
3399


3400
3401
3402
3403
3404
3405
3406
  ** EVIDENCE-OF: R-60642-04082 If the writer lock cannot be obtained
  ** immediately, and a busy-handler is configured, it is invoked and the
  ** writer lock retried until either the busy-handler returns 0 or the
  ** lock is successfully obtained.
  */
  if( eMode!=SQLITE_CHECKPOINT_PASSIVE ){
    if( walIsServer(pWal) ){


      if( eMode>=SQLITE_CHECKPOINT_RESTART ){
        /* Exclusive lock on page 1. This is exclusive access to the db. */
        rc = sqlite3ServerLock(pWal->pServer, 1, 1, 1);
      }else{
        /* Take the server write-lock ("page" 0) */
        rc = sqlite3ServerLock(pWal->pServer, 0, 1, 1);
      }







>
>







3394
3395
3396
3397
3398
3399
3400
3401
3402
3403
3404
3405
3406
3407
3408
3409
  ** EVIDENCE-OF: R-60642-04082 If the writer lock cannot be obtained
  ** immediately, and a busy-handler is configured, it is invoked and the
  ** writer lock retried until either the busy-handler returns 0 or the
  ** lock is successfully obtained.
  */
  if( eMode!=SQLITE_CHECKPOINT_PASSIVE ){
    if( walIsServer(pWal) ){
      rc = sqlite3ServerBegin(pWal->pServer);
      if( rc!=SQLITE_OK ) goto ckpt_out;
      if( eMode>=SQLITE_CHECKPOINT_RESTART ){
        /* Exclusive lock on page 1. This is exclusive access to the db. */
        rc = sqlite3ServerLock(pWal->pServer, 1, 1, 1);
      }else{
        /* Take the server write-lock ("page" 0) */
        rc = sqlite3ServerLock(pWal->pServer, 0, 1, 1);
      }
3447
3448
3449
3450
3451
3452
3453

3454
3455
3456
3457
3458
3459
3460
    ** next time the pager opens a snapshot on this database it knows that
    ** the cache needs to be reset.
    */
    memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
  }

  /* Release the locks. */

  sqlite3WalEndWriteTransaction(pWal);
  walUnlockExclusive(pWal, WAL_CKPT_LOCK, 1);
  pWal->ckptLock = 0;
  WALTRACE(("WAL%p: checkpoint %s\n", pWal, rc ? "failed" : "ok"));
  if( walIsServer(pWal) ) sqlite3ServerEnd(pWal->pServer);
  return (rc==SQLITE_OK && eMode!=eMode2 ? SQLITE_BUSY : rc);
}







>







3450
3451
3452
3453
3454
3455
3456
3457
3458
3459
3460
3461
3462
3463
3464
    ** next time the pager opens a snapshot on this database it knows that
    ** the cache needs to be reset.
    */
    memset(&pWal->hdr, 0, sizeof(WalIndexHdr));
  }

  /* Release the locks. */
 ckpt_out:
  sqlite3WalEndWriteTransaction(pWal);
  walUnlockExclusive(pWal, WAL_CKPT_LOCK, 1);
  pWal->ckptLock = 0;
  WALTRACE(("WAL%p: checkpoint %s\n", pWal, rc ? "failed" : "ok"));
  if( walIsServer(pWal) ) sqlite3ServerEnd(pWal->pServer);
  return (rc==SQLITE_OK && eMode!=eMode2 ? SQLITE_BUSY : rc);
}