/ Check-in [1ebe5fc7]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:On unix, embargo close() operations until all locks have cleared from the file. Ticket #561. (CVS 1171)
Downloads: Tarball | ZIP archive | SQL archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 1ebe5fc7b03a6b070a5d52ffedb95f0d519ab068
User & Date: drh 2004-01-12 00:39:06
Context
2004-01-14
03:12
Remove an incomplete comment that somehow snuck into the sources. (CVS 1172) check-in: ea1ad465 user: drh tags: trunk
2004-01-12
00:39
On unix, embargo close() operations until all locks have cleared from the file. Ticket #561. (CVS 1171) check-in: 1ebe5fc7 user: drh tags: trunk
00:38
Previous commit of changes to the in-memory backend was not quite right. This check-in should square things away. (CVS 1170) check-in: 75d91e3b user: drh tags: trunk
Changes
Hide Diffs Side-by-Side Diffs Ignore Whitespace Patch

Changes to src/os.c.

   160    160   ** file is unlocked.  cnt==-1 means the file has an exclusive lock.
   161    161   ** cnt>0 means there are cnt shared locks on the file.
   162    162   **
   163    163   ** Any attempt to lock or unlock a file first checks the locking
   164    164   ** structure.  The fcntl() system call is only invoked to set a 
   165    165   ** POSIX lock if the internal lock structure transitions between
   166    166   ** a locked and an unlocked state.
          167  +**
          168  +** 2004-Jan-11:
          169  +** More recent discoveries about POSIX advisory locks.  (The more
          170  +** I discover, the more I realize that POSIX advisory locks are
          171  +** an abomination.)
          172  +**
          173  +** If you close a file descriptor that points to a file that has locks,
          174  +** all locks on that file that are owned by the current process are
          175  +** released.  To work around this problem, each OsFile structure contains
          176  +** a pointer to an openCnt structure.  There is one openCnt structure
          177  +** per open inode, which means that multiple OsFiles can point to a single
          178  +** openCnt.  When an attempt is made to close an OsFile, if there are
          179  +** other OsFiles open on the same inode that are holding locks, the call
          180  +** to close() the file descriptor is deferred until all of the locks clear.
          181  +** The openCnt structure keeps a list of file descriptors that need to
          182  +** be closed and that list is walked (and cleared) when the last lock
          183  +** clears.
          184  +**
          185  +** First, under Linux threads, because each thread has a separate
          186  +** process ID, lock operations in one thread do not override locks
          187  +** to the same file in other threads.  Linux threads behave like
          188  +** separate processes in this respect.  But, if you close a file
          189  +** descriptor in linux threads, all locks are cleared, even locks
          190  +** on other threads and even though the other threads have different
          191  +** process IDs.  Linux threads is inconsistent in this respect.
          192  +** (I'm beginning to think that linux threads is an abomination too.)
          193  +** The consequence of this all is that the hash table for the lockInfo
          194  +** structure has to include the process id as part of its key because
          195  +** locks in different threads are treated as distinct.  But the 
          196  +** openCnt structure should not include the process id in its
          197  +** key because close() clears locks on all threads, not just the current
          198  +** thread.  Were it not for this goofiness in linux threads, we could
          199  +** combine the lockInfo and openCnt structures into a single structure.
   167    200   */
   168    201   
   169    202   /*
   170    203   ** An instance of the following structure serves as the key used
   171    204   ** to locate a particular lockInfo structure given its inode.  Note
   172    205   ** that we have to include the process ID as part of the key.  On some
   173    206   ** threading implementations (ex: linux), each thread has a separate
................................................................................
   176    209   struct lockKey {
   177    210     dev_t dev;   /* Device number */
   178    211     ino_t ino;   /* Inode number */
   179    212     pid_t pid;   /* Process ID */
   180    213   };
   181    214   
   182    215   /*
   183         -** An instance of the following structure is allocated for each inode.
          216  +** An instance of the following structure is allocated for each open
          217  +** inode on each thread with a different process ID.  (Threads have
          218  +** different process IDs on linux, but not on most other unixes.)
          219  +**
   184    220   ** A single inode can have multiple file descriptors, so each OsFile
   185    221   ** structure contains a pointer to an instance of this object and this
   186    222   ** object keeps a count of the number of OsFiles pointing to it.
   187    223   */
   188    224   struct lockInfo {
   189    225     struct lockKey key;  /* The lookup key */
   190         -  int cnt;              /* 0: unlocked.  -1: write lock.  1...: read lock. */
          226  +  int cnt;             /* 0: unlocked.  -1: write lock.  1...: read lock. */
          227  +  int nRef;            /* Number of pointers to this structure */
          228  +};
          229  +
          230  +/*
          231  +** An instance of the following structure serves as the key used
          232  +** to locate a particular openCnt structure given its inode.  This
          233  +** is the same as the lockKey except that the process ID is omitted.
          234  +*/
          235  +struct openKey {
          236  +  dev_t dev;   /* Device number */
          237  +  ino_t ino;   /* Inode number */
          238  +};
          239  +
          240  +/*
          241  +** An instance of the following structure is allocated for each open
          242  +** inode.  This structure keeps track of the number of locks on that
          243  +** inode.  If a close is attempted against an inode that is holding
          244  +** locks, the close is deferred until all locks clear by adding the
          245  +** file descriptor to be closed to the pending list.
          246  +*/
          247  +struct openCnt {
          248  +  struct openKey key;   /* The lookup key */
   191    249     int nRef;             /* Number of pointers to this structure */
          250  +  int nLock;            /* Number of outstanding locks */
          251  +  int nPending;         /* Number of pending close() operations */
          252  +  int *aPending;        /* Malloced space holding fd's awaiting a close() */
   192    253   };
   193    254   
   194    255   /* 
   195         -** This hash table maps inodes (in the form of lockKey structures) into
   196         -** pointers to lockInfo structures.
          256  +** These hash tables map inodes and process IDs into lockInfo and openCnt
          257  +** structures.  Access to these hash tables must be protected by a mutex.
   197    258   */
   198    259   static Hash lockHash = { SQLITE_HASH_BINARY, 0, 0, 0, 0, 0 };
   199         -
   200         -/*
   201         -** Given a file descriptor, locate a lockInfo structure that describes
   202         -** that file descriptor.  Create a new one if necessary.  NULL might
   203         -** be returned if malloc() fails.
   204         -*/
   205         -static struct lockInfo *findLockInfo(int fd){
   206         -  int rc;
   207         -  struct lockKey key;
   208         -  struct stat statbuf;
   209         -  struct lockInfo *pInfo;
   210         -  rc = fstat(fd, &statbuf);
   211         -  if( rc!=0 ) return 0;
   212         -  memset(&key, 0, sizeof(key));
   213         -  key.dev = statbuf.st_dev;
   214         -  key.ino = statbuf.st_ino;
   215         -  key.pid = getpid();
   216         -  pInfo = (struct lockInfo*)sqliteHashFind(&lockHash, &key, sizeof(key));
   217         -  if( pInfo==0 ){
   218         -    struct lockInfo *pOld;
   219         -    pInfo = sqliteMalloc( sizeof(*pInfo) );
   220         -    if( pInfo==0 ) return 0;
   221         -    pInfo->key = key;
   222         -    pInfo->nRef = 1;
   223         -    pInfo->cnt = 0;
   224         -    pOld = sqliteHashInsert(&lockHash, &pInfo->key, sizeof(key), pInfo);
   225         -    if( pOld!=0 ){
   226         -      assert( pOld==pInfo );
   227         -      sqliteFree(pInfo);
   228         -      pInfo = 0;
   229         -    }
   230         -  }else{
   231         -    pInfo->nRef++;
   232         -  }
   233         -  return pInfo;
   234         -}
          260  +static Hash openHash = { SQLITE_HASH_BINARY, 0, 0, 0, 0, 0 };
   235    261   
   236    262   /*
   237    263   ** Release a lockInfo structure previously allocated by findLockInfo().
   238    264   */
   239         -static void releaseLockInfo(struct lockInfo *pInfo){
   240         -  pInfo->nRef--;
   241         -  if( pInfo->nRef==0 ){
   242         -    sqliteHashInsert(&lockHash, &pInfo->key, sizeof(pInfo->key), 0);
   243         -    sqliteFree(pInfo);
   244         -  }
   245         -}
          265  +static void releaseLockInfo(struct lockInfo *pLock){
          266  +  pLock->nRef--;
          267  +  if( pLock->nRef==0 ){
          268  +    sqliteHashInsert(&lockHash, &pLock->key, sizeof(pLock->key), 0);
          269  +    sqliteFree(pLock);
          270  +  }
          271  +}
          272  +
          273  +/*
          274  +** Release an openCnt structure previously allocated by findLockInfo().
          275  +*/
          276  +static void releaseOpenCnt(struct openCnt *pOpen){
          277  +  pOpen->nRef--;
          278  +  if( pOpen->nRef==0 ){
          279  +    sqliteHashInsert(&openHash, &pOpen->key, sizeof(pOpen->key), 0);
          280  +    sqliteFree(pOpen->aPending);
          281  +    sqliteFree(pOpen);
          282  +  }
          283  +}
          284  +
          285  +/*
          286  +** Given a file descriptor, locate lockInfo and openCnt structures that
          287  +** describe that file descriptor.  Create new ones if necessary.  The
          288  +** return values might be unset if an error occurs.
          289  +**
          290  +** Return the number of errors.
          291  +*/
          292  +int findLockInfo(
          293  +  int fd,                      /* The file descriptor used in the key */
          294  +  struct lockInfo **ppLock,    /* Return the lockInfo structure here */
          295  +  struct openCnt **ppOpen   /* Return the openCnt structure here */
          296  +){
          297  +  int rc;
          298  +  struct lockKey key1;
          299  +  struct openKey key2;
          300  +  struct stat statbuf;
          301  +  struct lockInfo *pLock;
          302  +  struct openCnt *pOpen;
          303  +  rc = fstat(fd, &statbuf);
          304  +  if( rc!=0 ) return 1;
          305  +  memset(&key1, 0, sizeof(key1));
          306  +  key1.dev = statbuf.st_dev;
          307  +  key1.ino = statbuf.st_ino;
          308  +  key1.pid = getpid();
          309  +  memset(&key2, 0, sizeof(key2));
          310  +  key2.dev = statbuf.st_dev;
          311  +  key2.ino = statbuf.st_ino;
          312  +  pLock = (struct lockInfo*)sqliteHashFind(&lockHash, &key1, sizeof(key1));
          313  +  if( pLock==0 ){
          314  +    struct lockInfo *pOld;
          315  +    pLock = sqliteMallocRaw( sizeof(*pLock) );
          316  +    if( pLock==0 ) return 1;
          317  +    pLock->key = key1;
          318  +    pLock->nRef = 1;
          319  +    pLock->cnt = 0;
          320  +    pOld = sqliteHashInsert(&lockHash, &pLock->key, sizeof(key1), pLock);
          321  +    if( pOld!=0 ){
          322  +      assert( pOld==pLock );
          323  +      sqliteFree(pLock);
          324  +      return 1;
          325  +    }
          326  +  }else{
          327  +    pLock->nRef++;
          328  +  }
          329  +  *ppLock = pLock;
          330  +  pOpen = (struct openCnt*)sqliteHashFind(&openHash, &key2, sizeof(key2));
          331  +  if( pOpen==0 ){
          332  +    struct openCnt *pOld;
          333  +    pOpen = sqliteMallocRaw( sizeof(*pOpen) );
          334  +    if( pOpen==0 ){
          335  +      releaseLockInfo(pLock);
          336  +      return 1;
          337  +    }
          338  +    pOpen->key = key2;
          339  +    pOpen->nRef = 1;
          340  +    pOpen->nLock = 0;
          341  +    pOpen->nPending = 0;
          342  +    pOpen->aPending = 0;
          343  +    pOld = sqliteHashInsert(&openHash, &pOpen->key, sizeof(key2), pOpen);
          344  +    if( pOld!=0 ){
          345  +      assert( pOld==pOpen );
          346  +      sqliteFree(pOpen);
          347  +      releaseLockInfo(pLock);
          348  +      return 1;
          349  +    }
          350  +  }else{
          351  +    pOpen->nRef++;
          352  +  }
          353  +  *ppOpen = pOpen;
          354  +  return 0;
          355  +}
          356  +
   246    357   #endif  /** POSIX advisory lock work-around **/
   247    358   
   248    359   /*
   249    360   ** If we compile with the SQLITE_TEST macro set, then the following block
   250    361   ** of code will give us the ability to simulate a disk I/O error.  This
   251    362   ** is used for testing the I/O recovery logic.
   252    363   */
................................................................................
   345    456   */
   346    457   int sqliteOsOpenReadWrite(
   347    458     const char *zFilename,
   348    459     OsFile *id,
   349    460     int *pReadonly
   350    461   ){
   351    462   #if OS_UNIX
          463  +  int rc;
   352    464     id->dirfd = -1;
   353    465     id->fd = open(zFilename, O_RDWR|O_CREAT|O_LARGEFILE|O_BINARY, 0644);
   354    466     if( id->fd<0 ){
   355    467       id->fd = open(zFilename, O_RDONLY|O_LARGEFILE|O_BINARY);
   356    468       if( id->fd<0 ){
   357    469         return SQLITE_CANTOPEN; 
   358    470       }
   359    471       *pReadonly = 1;
   360    472     }else{
   361    473       *pReadonly = 0;
   362    474     }
   363    475     sqliteOsEnterMutex();
   364         -  id->pLock = findLockInfo(id->fd);
          476  +  rc = findLockInfo(id->fd, &id->pLock, &id->pOpen);
   365    477     sqliteOsLeaveMutex();
   366         -  if( id->pLock==0 ){
          478  +  if( rc ){
   367    479       close(id->fd);
   368    480       return SQLITE_NOMEM;
   369    481     }
   370    482     id->locked = 0;
   371    483     TRACE3("OPEN    %-3d %s\n", id->fd, zFilename);
   372    484     OpenCounter(+1);
   373    485     return SQLITE_OK;
................................................................................
   467    579   **
   468    580   ** On success, write the file handle into *id and return SQLITE_OK.
   469    581   **
   470    582   ** On failure, return SQLITE_CANTOPEN.
   471    583   */
   472    584   int sqliteOsOpenExclusive(const char *zFilename, OsFile *id, int delFlag){
   473    585   #if OS_UNIX
          586  +  int rc;
   474    587     if( access(zFilename, 0)==0 ){
   475    588       return SQLITE_CANTOPEN;
   476    589     }
   477    590     id->dirfd = -1;
   478    591     id->fd = open(zFilename,
   479    592                   O_RDWR|O_CREAT|O_EXCL|O_NOFOLLOW|O_LARGEFILE|O_BINARY, 0600);
   480    593     if( id->fd<0 ){
   481    594       return SQLITE_CANTOPEN;
   482    595     }
   483    596     sqliteOsEnterMutex();
   484         -  id->pLock = findLockInfo(id->fd);
          597  +  rc = findLockInfo(id->fd, &id->pLock, &id->pOpen);
   485    598     sqliteOsLeaveMutex();
   486         -  if( id->pLock==0 ){
          599  +  if( rc ){
   487    600       close(id->fd);
   488    601       unlink(zFilename);
   489    602       return SQLITE_NOMEM;
   490    603     }
   491    604     id->locked = 0;
   492    605     if( delFlag ){
   493    606       unlink(zFilename);
................................................................................
   557    670   **
   558    671   ** On success, write the file handle into *id and return SQLITE_OK.
   559    672   **
   560    673   ** On failure, return SQLITE_CANTOPEN.
   561    674   */
   562    675   int sqliteOsOpenReadOnly(const char *zFilename, OsFile *id){
   563    676   #if OS_UNIX
          677  +  int rc;
   564    678     id->dirfd = -1;
   565    679     id->fd = open(zFilename, O_RDONLY|O_LARGEFILE|O_BINARY);
   566    680     if( id->fd<0 ){
   567    681       return SQLITE_CANTOPEN;
   568    682     }
   569    683     sqliteOsEnterMutex();
   570         -  id->pLock = findLockInfo(id->fd);
          684  +  rc = findLockInfo(id->fd, &id->pLock, &id->pOpen);
   571    685     sqliteOsLeaveMutex();
   572         -  if( id->pLock==0 ){
          686  +  if( rc ){
   573    687       close(id->fd);
   574    688       return SQLITE_NOMEM;
   575    689     }
   576    690     id->locked = 0;
   577    691     TRACE3("OPEN-RO %-3d %s\n", id->fd, zFilename);
   578    692     OpenCounter(+1);
   579    693     return SQLITE_OK;
................................................................................
   759    873       if( !sqliteOsFileExists(zBuf) ) break;
   760    874     }
   761    875   #endif
   762    876     return SQLITE_OK; 
   763    877   }
   764    878   
   765    879   /*
   766         -** Close a file
          880  +** Close a file.
   767    881   */
   768    882   int sqliteOsClose(OsFile *id){
   769    883   #if OS_UNIX
   770         -  close(id->fd);
          884  +  sqliteOsUnlock(id);
   771    885     if( id->dirfd>=0 ) close(id->dirfd);
   772    886     id->dirfd = -1;
   773    887     sqliteOsEnterMutex();
          888  +  if( id->pOpen->nLock ){
          889  +    /* If there are outstanding locks, do not actually close the file just
          890  +    ** yet because that would clear those locks.  Instead, add the file
          891  +    ** descriptor to pOpen->aPending.  It will be automatically closed when
          892  +    ** the last lock is cleared.
          893  +    */
          894  +    int *aNew;
          895  +    struct openCnt *pOpen = id->pOpen;
          896  +    pOpen->nPending++;
          897  +    aNew = sqliteRealloc( pOpen->aPending, pOpen->nPending*sizeof(int) );
          898  +    if( aNew==0 ){
          899  +      /* If a malloc fails, just leak the file descriptor */
          900  +    }else{
          901  +      pOpen->aPending = aNew;
          902  +      pOpen->aPending[pOpen->nPending-1] = id->fd;
          903  +    }
          904  +  }else{
          905  +    /* There are no outstanding locks so we can close the file immediately */
          906  +    close(id->fd);
          907  +  }
   774    908     releaseLockInfo(id->pLock);
          909  +  releaseOpenCnt(id->pOpen);
   775    910     sqliteOsLeaveMutex();
   776    911     TRACE2("CLOSE   %-3d\n", id->fd);
   777    912     OpenCounter(-1);
   778    913     return SQLITE_OK;
   779    914   #endif
   780    915   #if OS_WIN
   781    916     CloseHandle(id->h);
................................................................................
  1155   1290   #if OS_UNIX
  1156   1291     int rc;
  1157   1292     sqliteOsEnterMutex();
  1158   1293     if( id->pLock->cnt>0 ){
  1159   1294       if( !id->locked ){
  1160   1295         id->pLock->cnt++;
  1161   1296         id->locked = 1;
         1297  +      id->pOpen->nLock++;
  1162   1298       }
  1163   1299       rc = SQLITE_OK;
  1164   1300     }else if( id->locked || id->pLock->cnt==0 ){
  1165   1301       struct flock lock;
  1166   1302       int s;
  1167   1303       lock.l_type = F_RDLCK;
  1168   1304       lock.l_whence = SEEK_SET;
  1169   1305       lock.l_start = lock.l_len = 0L;
  1170   1306       s = fcntl(id->fd, F_SETLK, &lock);
  1171   1307       if( s!=0 ){
  1172   1308         rc = (errno==EINVAL) ? SQLITE_NOLFS : SQLITE_BUSY;
  1173   1309       }else{
  1174   1310         rc = SQLITE_OK;
         1311  +      if( !id->locked ){
         1312  +        id->pOpen->nLock++;
         1313  +        id->locked = 1;
         1314  +      }
  1175   1315         id->pLock->cnt = 1;
  1176         -      id->locked = 1;
  1177   1316       }
  1178   1317     }else{
  1179   1318       rc = SQLITE_BUSY;
  1180   1319     }
  1181   1320     sqliteOsLeaveMutex();
  1182   1321     return rc;
  1183   1322   #endif
................................................................................
  1272   1411       lock.l_whence = SEEK_SET;
  1273   1412       lock.l_start = lock.l_len = 0L;
  1274   1413       s = fcntl(id->fd, F_SETLK, &lock);
  1275   1414       if( s!=0 ){
  1276   1415         rc = (errno==EINVAL) ? SQLITE_NOLFS : SQLITE_BUSY;
  1277   1416       }else{
  1278   1417         rc = SQLITE_OK;
         1418  +      if( !id->locked ){
         1419  +        id->pOpen->nLock++;
         1420  +        id->locked = 1;
         1421  +      }
  1279   1422         id->pLock->cnt = -1;
  1280         -      id->locked = 1;
  1281   1423       }
  1282   1424     }else{
  1283   1425       rc = SQLITE_BUSY;
  1284   1426     }
  1285   1427     sqliteOsLeaveMutex();
  1286   1428     return rc;
  1287   1429   #endif
................................................................................
  1386   1528       s = fcntl(id->fd, F_SETLK, &lock);
  1387   1529       if( s!=0 ){
  1388   1530         rc = (errno==EINVAL) ? SQLITE_NOLFS : SQLITE_BUSY;
  1389   1531       }else{
  1390   1532         rc = SQLITE_OK;
  1391   1533         id->pLock->cnt = 0;
  1392   1534       }
         1535  +  }
         1536  +  if( rc==SQLITE_OK ){
         1537  +    /* Decrement the count of locks against this same file.  When the
         1538  +    ** count reaches zero, close any other file descriptors whose close
         1539  +    ** was deferred because of outstanding locks.
         1540  +    */
         1541  +    struct openCnt *pOpen = id->pOpen;
         1542  +    pOpen->nLock--;
         1543  +    assert( pOpen->nLock>=0 );
         1544  +    if( pOpen->nLock==0 && pOpen->nPending>0 ){
         1545  +      int i;
         1546  +      for(i=0; i<pOpen->nPending; i++){
         1547  +        close(pOpen->aPending[i]);
         1548  +      }
         1549  +      sqliteFree(pOpen->aPending);
         1550  +      pOpen->nPending = 0;
         1551  +      pOpen->aPending = 0;
         1552  +    }
  1393   1553     }
  1394   1554     sqliteOsLeaveMutex();
  1395   1555     id->locked = 0;
  1396   1556     return rc;
  1397   1557   #endif
  1398   1558   #if OS_WIN
  1399   1559     int rc;

Changes to src/os.h.

    99     99   #if OS_UNIX
   100    100   # include <sys/types.h>
   101    101   # include <sys/stat.h>
   102    102   # include <fcntl.h>
   103    103   # include <unistd.h>
   104    104     typedef struct OsFile OsFile;
   105    105     struct OsFile {
   106         -    struct lockInfo *pLock;  /* Information about locks on this inode */
   107         -    int fd;                  /* The file descriptor */
   108         -    int locked;              /* True if this user holds the lock */
   109         -    int dirfd;               /* File descriptor for the directory */
          106  +    struct openCnt *pOpen;    /* Info about all open fd's on this inode */
          107  +    struct lockInfo *pLock;   /* Info about locks on this inode */
          108  +    int fd;                   /* The file descriptor */
          109  +    int locked;               /* True if this instance holds the lock */
          110  +    int dirfd;                /* File descriptor for the directory */
   110    111     };
   111    112   # define SQLITE_TEMPNAME_SIZE 200
   112    113   # if defined(HAVE_USLEEP) && HAVE_USLEEP
   113    114   #  define SQLITE_MIN_SLEEP_MS 1
   114    115   # else
   115    116   #  define SQLITE_MIN_SLEEP_MS 1000
   116    117   # endif