SQLite

Check-in [55e47ef338]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Modify the code in vdbesort.c so that most reads and writes to temporary files are aligned page-sized blocks.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | sorter-coalesce-writes
Files: files | file ages | folders
SHA1: 55e47ef338c42f95f0f071d6ec92cd2480f9f1fe
User & Date: dan 2012-07-23 19:25:39.921
Context
2012-07-23
20:10
Fix an edge case in vdbesort.c. (check-in: 4ba266fc53 user: dan tags: sorter-coalesce-writes)
19:25
Modify the code in vdbesort.c so that most reads and writes to temporary files are aligned page-sized blocks. (check-in: 55e47ef338 user: dan tags: sorter-coalesce-writes)
2012-07-17
14:37
Ensure that there is always at least one aReadMark slot usable by an unprivileged reader while a checkpoint is running. Also, if one or more transactions are recovered from a log file, initialize one of the aReadMark slots to contain mxFrame as part of the recovery process. (check-in: e416359633 user: dan tags: trunk)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/vdbesort.c.
18
19
20
21
22
23
24

25
26
27
28
29
30
31
#include "sqliteInt.h"
#include "vdbeInt.h"

#ifndef SQLITE_OMIT_MERGE_SORT

typedef struct VdbeSorterIter VdbeSorterIter;
typedef struct SorterRecord SorterRecord;


/*
** NOTES ON DATA STRUCTURE USED FOR N-WAY MERGES:
**
** As keys are added to the sorter, they are written to disk in a series
** of sorted packed-memory-arrays (PMAs). The size of each PMA is roughly
** the same as the cache-size allowed for temporary databases. In order







>







18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
#include "sqliteInt.h"
#include "vdbeInt.h"

#ifndef SQLITE_OMIT_MERGE_SORT

typedef struct VdbeSorterIter VdbeSorterIter;
typedef struct SorterRecord SorterRecord;
typedef struct FileWriter FileWriter;

/*
** NOTES ON DATA STRUCTURE USED FOR N-WAY MERGES:
**
** As keys are added to the sorter, they are written to disk in a series
** of sorted packed-memory-arrays (PMAs). The size of each PMA is roughly
** the same as the cache-size allowed for temporary databases. In order
115
116
117
118
119
120
121
















122
123
124
125
126
127
128
  i64 iReadOff;                   /* Current read offset */
  i64 iEof;                       /* 1 byte past EOF for this iterator */
  int nAlloc;                     /* Bytes of space at aAlloc */
  int nKey;                       /* Number of bytes in key */
  sqlite3_file *pFile;            /* File iterator is reading from */
  u8 *aAlloc;                     /* Allocated space */
  u8 *aKey;                       /* Pointer to current key */
















};

/*
** A structure to store a single record. All in-memory records are connected
** together into a linked list headed at VdbeSorter.pRecord using the 
** SorterRecord.pNext pointer.
*/







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
  i64 iReadOff;                   /* Current read offset */
  i64 iEof;                       /* 1 byte past EOF for this iterator */
  int nAlloc;                     /* Bytes of space at aAlloc */
  int nKey;                       /* Number of bytes in key */
  sqlite3_file *pFile;            /* File iterator is reading from */
  u8 *aAlloc;                     /* Allocated space */
  u8 *aKey;                       /* Pointer to current key */
  u8 *aBuffer;                    /* Current read buffer */
  int nBuffer;                    /* Size of read buffer in bytes */
};

/*
** An instance of this structure is used to separate the stream of records
** being written to files by the merge-sort code into aligned, page-sized
** blocks.
*/
struct FileWriter {
  u8 *aBuffer;                    /* Pointer to write buffer */
  int nBuffer;                    /* Size of write buffer in bytes */
  int iBufStart;                  /* First byte of buffer to write */
  int iBufEnd;                    /* Last byte of buffer to write */
  i64 iWriteOff;                  /* Offset of start of buffer in file */
  sqlite3_file *pFile;            /* File to write to */
};

/*
** A structure to store a single record. All in-memory records are connected
** together into a linked list headed at VdbeSorter.pRecord using the 
** SorterRecord.pNext pointer.
*/
140
141
142
143
144
145
146

147
148

































































































149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267



268
269
270

271
272
273
274



275
276
277

278



279



280




281

282


283
284
285
286
287
288
289

/*
** Free all memory belonging to the VdbeSorterIter object passed as the second
** argument. All structure fields are set to zero before returning.
*/
static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
  sqlite3DbFree(db, pIter->aAlloc);

  memset(pIter, 0, sizeof(VdbeSorterIter));
}


































































































/*
** Advance iterator pIter to the next key in its PMA. Return SQLITE_OK if
** no error occurs, or an SQLite error code if one does.
*/
static int vdbeSorterIterNext(
  sqlite3 *db,                    /* Database handle (for sqlite3DbMalloc() ) */
  VdbeSorterIter *pIter           /* Iterator to advance */
){
  int rc;                         /* Return Code */
  int nRead;                      /* Number of bytes read */
  int nRec = 0;                   /* Size of record in bytes */
  int iOff = 0;                   /* Size of serialized size varint in bytes */

  assert( pIter->iEof>=pIter->iReadOff );
  if( pIter->iEof-pIter->iReadOff>5 ){
    nRead = 5;
  }else{
    nRead = (int)(pIter->iEof - pIter->iReadOff);
  }
  if( nRead<=0 ){
    /* This is an EOF condition */
    vdbeSorterIterZero(db, pIter);
    return SQLITE_OK;
  }

  rc = sqlite3OsRead(pIter->pFile, pIter->aAlloc, nRead, pIter->iReadOff);
  if( rc==SQLITE_OK ){
    iOff = getVarint32(pIter->aAlloc, nRec);
    if( (iOff+nRec)>nRead ){
      int nRead2;                   /* Number of extra bytes to read */
      if( (iOff+nRec)>pIter->nAlloc ){
        int nNew = pIter->nAlloc*2;
        while( (iOff+nRec)>nNew ) nNew = nNew*2;
        pIter->aAlloc = sqlite3DbReallocOrFree(db, pIter->aAlloc, nNew);
        if( !pIter->aAlloc ) return SQLITE_NOMEM;
        pIter->nAlloc = nNew;
      }
  
      nRead2 = iOff + nRec - nRead;
      rc = sqlite3OsRead(
          pIter->pFile, &pIter->aAlloc[nRead], nRead2, pIter->iReadOff+nRead
      );
    }
  }

  assert( rc!=SQLITE_OK || nRec>0 );
  pIter->iReadOff += iOff+nRec;
  pIter->nKey = nRec;
  pIter->aKey = &pIter->aAlloc[iOff];
  return rc;
}

/*
** Write a single varint, value iVal, to file-descriptor pFile. Return
** SQLITE_OK if successful, or an SQLite error code if some error occurs.
**
** The value of *piOffset when this function is called is used as the byte
** offset in file pFile to write to. Before returning, *piOffset is 
** incremented by the number of bytes written.
*/
static int vdbeSorterWriteVarint(
  sqlite3_file *pFile,            /* File to write to */
  i64 iVal,                       /* Value to write as a varint */
  i64 *piOffset                   /* IN/OUT: Write offset in file pFile */
){
  u8 aVarint[9];                  /* Buffer large enough for a varint */
  int nVarint;                    /* Number of used bytes in varint */
  int rc;                         /* Result of write() call */

  nVarint = sqlite3PutVarint(aVarint, iVal);
  rc = sqlite3OsWrite(pFile, aVarint, nVarint, *piOffset);
  *piOffset += nVarint;

  return rc;
}

/*
** Read a single varint from file-descriptor pFile. Return SQLITE_OK if
** successful, or an SQLite error code if some error occurs.
**
** The value of *piOffset when this function is called is used as the
** byte offset in file pFile from whence to read the varint. If successful
** (i.e. if no IO error occurs), then *piOffset is set to the offset of
** the first byte past the end of the varint before returning. *piVal is
** set to the integer value read. If an error occurs, the final values of
** both *piOffset and *piVal are undefined.
*/
static int vdbeSorterReadVarint(
  sqlite3_file *pFile,            /* File to read from */
  i64 *piOffset,                  /* IN/OUT: Read offset in pFile */
  i64 *piVal                      /* OUT: Value read from file */
){
  u8 aVarint[9];                  /* Buffer large enough for a varint */
  i64 iOff = *piOffset;           /* Offset in file to read from */
  int rc;                         /* Return code */

  rc = sqlite3OsRead(pFile, aVarint, 9, iOff);
  if( rc==SQLITE_OK ){
    *piOffset += getVarint(aVarint, (u64 *)piVal);
  }

  return rc;
}

/*
** Initialize iterator pIter to scan through the PMA stored in file pFile
** starting at offset iStart and ending at offset iEof-1. This function 
** leaves the iterator pointing to the first key in the PMA (or EOF if the 
** PMA is empty).
*/
static int vdbeSorterIterInit(
  sqlite3 *db,                    /* Database handle */
  VdbeSorter *pSorter,            /* Sorter object */
  i64 iStart,                     /* Start offset in pFile */
  VdbeSorterIter *pIter,          /* Iterator to populate */
  i64 *pnByte                     /* IN/OUT: Increment this value by PMA size */
){
  int rc;




  assert( pSorter->iWriteOff>iStart );
  assert( pIter->aAlloc==0 );

  pIter->pFile = pSorter->pTemp1;
  pIter->iReadOff = iStart;
  pIter->nAlloc = 128;
  pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);



  if( !pIter->aAlloc ){
    rc = SQLITE_NOMEM;
  }else{

    i64 nByte;                         /* Total size of PMA in bytes */



    rc = vdbeSorterReadVarint(pSorter->pTemp1, &pIter->iReadOff, &nByte);



    *pnByte += nByte;




    pIter->iEof = pIter->iReadOff + nByte;

  }


  if( rc==SQLITE_OK ){
    rc = vdbeSorterIterNext(db, pIter);
  }
  return rc;
}









>


>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>










<
|
<

<
|
<
<
<
<
<





|

<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
|
<
<
<
|
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<
<


















|
>
>
>



>




>
>
>
|


>
|
>
>
>
|
>
>
>
|
>
>
>
>
|
>
|
>
>







157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273

274

275

276





277
278
279
280
281
282
283




















284



285















































286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347

/*
** Free all memory belonging to the VdbeSorterIter object passed as the second
** argument. All structure fields are set to zero before returning.
*/
static void vdbeSorterIterZero(sqlite3 *db, VdbeSorterIter *pIter){
  sqlite3DbFree(db, pIter->aAlloc);
  sqlite3DbFree(db, pIter->aBuffer);
  memset(pIter, 0, sizeof(VdbeSorterIter));
}

/*
** Read nByte bytes of data from the stream of data iterated by object p.
** If successful, set *ppOut to point to a buffer containing the data
** and return SQLITE_OK. Otherwise, if an error occurs, return an SQLite
** error code.
**
** The buffer indicated by *ppOut may only be considered valid until the
** next call to this function.
*/
static int vdbeSorterIterRead(
  sqlite3 *db,                    /* Database handle (for malloc) */
  VdbeSorterIter *p,              /* Iterator */
  int nByte,                      /* Bytes of data to read */
  u8 **ppOut                      /* OUT: Pointer to buffer containing data */
){
  int iBuf;
  int nAvail;
  assert( p->aBuffer );

  iBuf = p->iReadOff % p->nBuffer;
  if( iBuf==0 ){
    int nRead;
    int rc;

    nRead = p->iEof - p->iReadOff;
    if( nRead>p->nBuffer ) nRead = p->nBuffer;
    assert( nRead>0 );
    rc = sqlite3OsRead(p->pFile, p->aBuffer, nRead, p->iReadOff);
    assert( rc!=SQLITE_IOERR_SHORT_READ );
    if( rc!=SQLITE_OK ) return rc;
  }
  nAvail = p->nBuffer - iBuf;

  if( nByte<=nAvail ){
    *ppOut = &p->aBuffer[iBuf];
    p->iReadOff += nByte;
  }else{
    int nRem;
    if( p->nAlloc<nByte ){
      int nNew = p->nAlloc*2;
      while( nByte>nNew ) nNew = nNew*2;

      p->aAlloc = sqlite3DbReallocOrFree(db, p->aAlloc, nNew);
      if( !p->aAlloc ) return SQLITE_NOMEM;
    }

    memcpy(p->aAlloc, &p->aBuffer[iBuf], nAvail);
    p->iReadOff += nAvail;
    nRem = nByte - nAvail;
    while( nRem>0 ){
      int rc;
      int nCopy;
      u8 *aNext;

      nCopy = nRem;
      if( nRem>p->nBuffer ) nCopy = p->nBuffer;
      rc = vdbeSorterIterRead(db, p, nCopy, &aNext);
      if( rc!=SQLITE_OK ) return rc;
      assert( aNext!=p->aAlloc );

      memcpy(&p->aAlloc[nByte - nRem], aNext, nCopy);
      nRem -= nCopy;
    }

    *ppOut = p->aAlloc;
  }

  return SQLITE_OK;
}

/*
** Read a varint from the stream of data accessed by p. Set *pnOut to
** the value read.
*/
static int vdbeSorterIterVarint(sqlite3 *db, VdbeSorterIter *p, u64 *pnOut){
  int iBuf;

  iBuf = p->iReadOff % p->nBuffer;
  if( iBuf && (p->nBuffer-iBuf)>=9 ){
    p->iReadOff += sqlite3GetVarint(&p->aBuffer[iBuf], pnOut);
  }else{
    u8 aVarint[9];
    int i;
    for(i=0; i<sizeof(aVarint); i++){
      u8 *a;
      int rc = vdbeSorterIterRead(db, p, 1, &a);
      if( rc ) return rc;
      aVarint[i] = *a;
      if( (aVarint[i] & 0x80)==0 ) break;
    }
    sqlite3GetVarint(aVarint, pnOut);
  }

  return SQLITE_OK;
}


/*
** Advance iterator pIter to the next key in its PMA. Return SQLITE_OK if
** no error occurs, or an SQLite error code if one does.
*/
static int vdbeSorterIterNext(
  sqlite3 *db,                    /* Database handle (for sqlite3DbMalloc() ) */
  VdbeSorterIter *pIter           /* Iterator to advance */
){
  int rc;                         /* Return Code */

  u64 nRec = 0;                   /* Size of record in bytes */



  if( pIter->iReadOff>=pIter->iEof ){





    /* This is an EOF condition */
    vdbeSorterIterZero(db, pIter);
    return SQLITE_OK;
  }

  rc = vdbeSorterIterVarint(db, pIter, &nRec);
  if( rc==SQLITE_OK ){




















    pIter->nKey = (int)nRec;



    rc = vdbeSorterIterRead(db, pIter, nRec, &pIter->aKey);















































  }

  return rc;
}

/*
** Initialize iterator pIter to scan through the PMA stored in file pFile
** starting at offset iStart and ending at offset iEof-1. This function 
** leaves the iterator pointing to the first key in the PMA (or EOF if the 
** PMA is empty).
*/
static int vdbeSorterIterInit(
  sqlite3 *db,                    /* Database handle */
  VdbeSorter *pSorter,            /* Sorter object */
  i64 iStart,                     /* Start offset in pFile */
  VdbeSorterIter *pIter,          /* Iterator to populate */
  i64 *pnByte                     /* IN/OUT: Increment this value by PMA size */
){
  int rc = SQLITE_OK;
  int nBuf;

  nBuf = sqlite3BtreeGetPageSize(db->aDb[0].pBt);

  assert( pSorter->iWriteOff>iStart );
  assert( pIter->aAlloc==0 );
  assert( pIter->aBuffer==0 );
  pIter->pFile = pSorter->pTemp1;
  pIter->iReadOff = iStart;
  pIter->nAlloc = 128;
  pIter->aAlloc = (u8 *)sqlite3DbMallocRaw(db, pIter->nAlloc);
  pIter->nBuffer = nBuf;
  pIter->aBuffer = (u8 *)sqlite3DbMallocRaw(db, nBuf);

  if( !pIter->aBuffer ){
    rc = SQLITE_NOMEM;
  }else{
    int iBuf;

    iBuf = pIter->iReadOff % nBuf;
    if( iBuf ){
      rc = sqlite3OsRead(
          pSorter->pTemp1, &pIter->aBuffer[iBuf], nBuf-iBuf, iStart
      );
      assert( rc!=SQLITE_IOERR_SHORT_READ );
    }

    if( rc==SQLITE_OK ){
      u64 nByte;                       /* Size of PMA in bytes */
      pIter->iEof = iStart + pIter->nBuffer;
      rc = vdbeSorterIterVarint(db, pIter, &nByte);
      pIter->iEof = pIter->iReadOff + nByte;
      *pnByte += nByte;
    }
  }

  if( rc==SQLITE_OK ){
    rc = vdbeSorterIterNext(db, pIter);
  }
  return rc;
}


527
528
529
530
531
532
533






















































































534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549

550



551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567


568

569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593

594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
  }
  pSorter->pRecord = p;

  sqlite3_free(aSlot);
  return SQLITE_OK;
}
























































































/*
** Write the current contents of the in-memory linked-list to a PMA. Return
** SQLITE_OK if successful, or an SQLite error code otherwise.
**
** The format of a PMA is:
**
**     * A varint. This varint contains the total number of bytes of content
**       in the PMA (not including the varint itself).
**
**     * One or more records packed end-to-end in order of ascending keys. 
**       Each record consists of a varint followed by a blob of data (the 
**       key). The varint is the number of bytes in the blob of data.
*/
static int vdbeSorterListToPMA(sqlite3 *db, VdbeCursor *pCsr){
  int rc = SQLITE_OK;             /* Return code */

  VdbeSorter *pSorter = pCsr->pSorter;




  if( pSorter->nInMemory==0 ){
    assert( pSorter->pRecord==0 );
    return rc;
  }

  rc = vdbeSorterSort(pCsr);

  /* If the first temporary PMA file has not been opened, open it now. */
  if( rc==SQLITE_OK && pSorter->pTemp1==0 ){
    rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
    assert( rc!=SQLITE_OK || pSorter->pTemp1 );
    assert( pSorter->iWriteOff==0 );
    assert( pSorter->nPMA==0 );
  }

  if( rc==SQLITE_OK ){


    i64 iOff = pSorter->iWriteOff;

    SorterRecord *p;
    SorterRecord *pNext = 0;
    static const char eightZeros[8] = { 0, 0, 0, 0, 0, 0, 0, 0 };

    pSorter->nPMA++;
    rc = vdbeSorterWriteVarint(pSorter->pTemp1, pSorter->nInMemory, &iOff);
    for(p=pSorter->pRecord; rc==SQLITE_OK && p; p=pNext){
      pNext = p->pNext;
      rc = vdbeSorterWriteVarint(pSorter->pTemp1, p->nVal, &iOff);

      if( rc==SQLITE_OK ){
        rc = sqlite3OsWrite(pSorter->pTemp1, p->pVal, p->nVal, iOff);
        iOff += p->nVal;
      }

      sqlite3DbFree(db, p);
    }

    /* This assert verifies that unless an error has occurred, the size of 
    ** the PMA on disk is the same as the expected size stored in
    ** pSorter->nInMemory. */ 
    assert( rc!=SQLITE_OK || pSorter->nInMemory==(
          iOff-pSorter->iWriteOff-sqlite3VarintLen(pSorter->nInMemory)
    ));


    pSorter->iWriteOff = iOff;
    if( rc==SQLITE_OK ){
      /* Terminate each file with 8 extra bytes so that from any offset
      ** in the file we can always read 9 bytes without a SHORT_READ error */
      rc = sqlite3OsWrite(pSorter->pTemp1, eightZeros, 8, iOff);
    }
    pSorter->pRecord = p;
  }

  return rc;
}

/*
** Add a record to the sorter.
*/







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
















>

>
>
>

















>
>
|
>


|


|


<
|

|
<





<
<
|
<
<
<
|
>
|
|
<
<
<
<
<
<







585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727

728
729
730

731
732
733
734
735


736



737
738
739
740






741
742
743
744
745
746
747
  }
  pSorter->pRecord = p;

  sqlite3_free(aSlot);
  return SQLITE_OK;
}

/*
** Initialize a file-writer object.
*/
static int fileWriterInit(
  sqlite3 *db,                    /* Database (for malloc) */
  sqlite3_file *pFile,            /* File to write to */
  FileWriter *p,                  /* Object to populate */
  i64 iStart                      /* Offset of pFile to begin writing at */
){
  int nBuf = sqlite3BtreeGetPageSize(db->aDb[0].pBt);

  memset(p, 0, sizeof(FileWriter));
  p->aBuffer = (u8 *)sqlite3DbMallocRaw(db, nBuf);
  if( !p->aBuffer ) return SQLITE_NOMEM;

  p->iBufEnd = p->iBufStart = (iStart % nBuf);
  p->iWriteOff = iStart - p->iBufStart;
  p->nBuffer = nBuf;
  p->pFile = pFile;
  return SQLITE_OK;
}

/*
** Write nData bytes of data to the file-write object. Return SQLITE_OK
** if successful, or an SQLite error code if an error occurs.
*/
static int fileWriterWrite(FileWriter *p, u8 *pData, int nData){
  int nRem = nData;
  while( nRem>0 ){
    int nCopy = nRem;
    if( nCopy>(p->nBuffer - p->iBufEnd) ){
      nCopy = p->nBuffer - p->iBufEnd;
    }

    memcpy(&p->aBuffer[p->iBufEnd], &pData[nData-nRem], nCopy);
    p->iBufEnd += nCopy;
    if( p->iBufEnd==p->nBuffer ){
      int rc = sqlite3OsWrite(p->pFile, 
          &p->aBuffer[p->iBufStart], p->iBufEnd - p->iBufStart, 
          p->iWriteOff + p->iBufStart
      );
      if( rc!=SQLITE_OK ) return rc;
      p->iBufStart = p->iBufEnd = 0;
      p->iWriteOff += p->nBuffer;
    }
    assert( p->iBufEnd<p->nBuffer );

    nRem -= nCopy;
  }

  return SQLITE_OK;
}

/*
** Flush any buffered data to disk and clean up the file-writer object.
** The results of using the file-writer after this call are undefined.
** Return SQLITE_OK if flushing the buffered data succeeds or is not 
** required. Otherwise, return an SQLite error code.
**
** Before returning, set *piEof to the offset immediately following the
** last byte written to the file.
*/
static int fileWriterFinish(sqlite3 *db, FileWriter *p, i64 *piEof){
  int rc = SQLITE_OK;
  if( p->aBuffer && p->iBufEnd>p->iBufStart ){
    rc = sqlite3OsWrite(p->pFile, 
        &p->aBuffer[p->iBufStart], p->iBufEnd - p->iBufStart, 
        p->iWriteOff + p->iBufStart
    );
  }
  *piEof = (p->iWriteOff + p->iBufEnd);
  sqlite3DbFree(db, p->aBuffer);
  memset(p, 0, sizeof(FileWriter));
  return rc;
}

/*
** Write value iVal encoded as a varint to the file-write object. Return 
** SQLITE_OK if successful, or an SQLite error code if an error occurs.
*/
static int fileWriterWriteVarint(FileWriter *p, u64 iVal){
  int nByte; 
  u8 aByte[10];
  nByte = sqlite3PutVarint(aByte, iVal);
  return fileWriterWrite(p, aByte, nByte);
}

/*
** Write the current contents of the in-memory linked-list to a PMA. Return
** SQLITE_OK if successful, or an SQLite error code otherwise.
**
** The format of a PMA is:
**
**     * A varint. This varint contains the total number of bytes of content
**       in the PMA (not including the varint itself).
**
**     * One or more records packed end-to-end in order of ascending keys. 
**       Each record consists of a varint followed by a blob of data (the 
**       key). The varint is the number of bytes in the blob of data.
*/
static int vdbeSorterListToPMA(sqlite3 *db, VdbeCursor *pCsr){
  int rc = SQLITE_OK;             /* Return code */
  int rc2;                        /* fileWriterFinish return code */
  VdbeSorter *pSorter = pCsr->pSorter;
  FileWriter writer;

  memset(&writer, 0, sizeof(FileWriter));

  if( pSorter->nInMemory==0 ){
    assert( pSorter->pRecord==0 );
    return rc;
  }

  rc = vdbeSorterSort(pCsr);

  /* If the first temporary PMA file has not been opened, open it now. */
  if( rc==SQLITE_OK && pSorter->pTemp1==0 ){
    rc = vdbeSorterOpenTempFile(db, &pSorter->pTemp1);
    assert( rc!=SQLITE_OK || pSorter->pTemp1 );
    assert( pSorter->iWriteOff==0 );
    assert( pSorter->nPMA==0 );
  }

  if( rc==SQLITE_OK ){
    rc = fileWriterInit(db, pSorter->pTemp1, &writer, pSorter->iWriteOff);
  }

  if( rc==SQLITE_OK ){
    SorterRecord *p;
    SorterRecord *pNext = 0;


    pSorter->nPMA++;
    rc = fileWriterWriteVarint(&writer, pSorter->nInMemory);
    for(p=pSorter->pRecord; rc==SQLITE_OK && p; p=pNext){
      pNext = p->pNext;

      rc = fileWriterWriteVarint(&writer, p->nVal);
      if( rc==SQLITE_OK ){
        rc = fileWriterWrite(&writer, p->pVal, p->nVal);

      }

      sqlite3DbFree(db, p);
    }



    pSorter->pRecord = p;



  }

  rc2 = fileWriterFinish(db, &writer, &pSorter->iWriteOff);
  if( rc==SQLITE_OK ) rc = rc2;







  return rc;
}

/*
** Add a record to the sorter.
*/
638
639
640
641
642
643
644





645
646

647
648
649
650
651
652
653
  **   * The total memory allocated for the in-memory list is greater 
  **     than (page-size * 10) and sqlite3HeapNearlyFull() returns true.
  */
  if( rc==SQLITE_OK && pSorter->mxPmaSize>0 && (
        (pSorter->nInMemory>pSorter->mxPmaSize)
     || (pSorter->nInMemory>pSorter->mnPmaSize && sqlite3HeapNearlyFull())
  )){





    rc = vdbeSorterListToPMA(db, pCsr);
    pSorter->nInMemory = 0;

  }

  return rc;
}

/*
** Helper function for sqlite3VdbeSorterRewind(). 







>
>
>
>
>


>







777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
  **   * The total memory allocated for the in-memory list is greater 
  **     than (page-size * 10) and sqlite3HeapNearlyFull() returns true.
  */
  if( rc==SQLITE_OK && pSorter->mxPmaSize>0 && (
        (pSorter->nInMemory>pSorter->mxPmaSize)
     || (pSorter->nInMemory>pSorter->mnPmaSize && sqlite3HeapNearlyFull())
  )){
#ifdef SQLITE_DEBUG
    i64 nExpect = pSorter->iWriteOff
                + sqlite3VarintLen(pSorter->nInMemory)
                + pSorter->nInMemory;
#endif
    rc = vdbeSorterListToPMA(db, pCsr);
    pSorter->nInMemory = 0;
    assert( rc!=SQLITE_OK || (nExpect==pSorter->iWriteOff) );
  }

  return rc;
}

/*
** Helper function for sqlite3VdbeSorterRewind(). 
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
  ** from the in-memory list.  */
  if( pSorter->nPMA==0 ){
    *pbEof = !pSorter->pRecord;
    assert( pSorter->aTree==0 );
    return vdbeSorterSort(pCsr);
  }

  /* Write the current b-tree to a PMA. Close the b-tree cursor. */
  rc = vdbeSorterListToPMA(db, pCsr);
  if( rc!=SQLITE_OK ) return rc;

  /* Allocate space for aIter[] and aTree[]. */
  nIter = pSorter->nPMA;
  if( nIter>SORTER_MAX_MERGE_COUNT ) nIter = SORTER_MAX_MERGE_COUNT;
  assert( nIter>0 );







|







845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
  ** from the in-memory list.  */
  if( pSorter->nPMA==0 ){
    *pbEof = !pSorter->pRecord;
    assert( pSorter->aTree==0 );
    return vdbeSorterSort(pCsr);
  }

  /* Write the current in-memory list to a PMA. */
  rc = vdbeSorterListToPMA(db, pCsr);
  if( rc!=SQLITE_OK ) return rc;

  /* Allocate space for aIter[] and aTree[]. */
  nIter = pSorter->nPMA;
  if( nIter>SORTER_MAX_MERGE_COUNT ) nIter = SORTER_MAX_MERGE_COUNT;
  assert( nIter>0 );
722
723
724
725
726
727
728


729


730
731
732
733
734
735
736
  do {
    int iNew;                     /* Index of new, merged, PMA */

    for(iNew=0; 
        rc==SQLITE_OK && iNew*SORTER_MAX_MERGE_COUNT<pSorter->nPMA; 
        iNew++
    ){


      i64 nWrite;                 /* Number of bytes in new PMA */



      /* If there are SORTER_MAX_MERGE_COUNT or less PMAs in file pTemp1,
      ** initialize an iterator for each of them and break out of the loop.
      ** These iterators will be incrementally merged as the VDBE layer calls
      ** sqlite3VdbeSorterNext().
      **
      ** Otherwise, if pTemp1 contains more than SORTER_MAX_MERGE_COUNT PMAs,







>
>

>
>







867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
  do {
    int iNew;                     /* Index of new, merged, PMA */

    for(iNew=0; 
        rc==SQLITE_OK && iNew*SORTER_MAX_MERGE_COUNT<pSorter->nPMA; 
        iNew++
    ){
      int rc2;                    /* Return code from fileWriterFinish() */
      FileWriter writer;          /* Object used to write to disk */
      i64 nWrite;                 /* Number of bytes in new PMA */

      memset(&writer, 0, sizeof(FileWriter));

      /* If there are SORTER_MAX_MERGE_COUNT or less PMAs in file pTemp1,
      ** initialize an iterator for each of them and break out of the loop.
      ** These iterators will be incrementally merged as the VDBE layer calls
      ** sqlite3VdbeSorterNext().
      **
      ** Otherwise, if pTemp1 contains more than SORTER_MAX_MERGE_COUNT PMAs,
745
746
747
748
749
750
751


752
753
754
755
756
757
758
759
760
761
762
763

764

765
766
767
768
769



770
771
772
773
774
775
776

      /* Open the second temp file, if it is not already open. */
      if( pTemp2==0 ){
        assert( iWrite2==0 );
        rc = vdbeSorterOpenTempFile(db, &pTemp2);
      }



      if( rc==SQLITE_OK ){
        rc = vdbeSorterWriteVarint(pTemp2, nWrite, &iWrite2);
      }

      if( rc==SQLITE_OK ){
        int bEof = 0;
        while( rc==SQLITE_OK && bEof==0 ){
          int nToWrite;
          VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
          assert( pIter->pFile );
          nToWrite = pIter->nKey + sqlite3VarintLen(pIter->nKey);
          rc = sqlite3OsWrite(pTemp2, pIter->aAlloc, nToWrite, iWrite2);

          iWrite2 += nToWrite;

          if( rc==SQLITE_OK ){
            rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
          }
        }
      }



    }

    if( pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
      break;
    }else{
      sqlite3_file *pTmp = pSorter->pTemp1;
      pSorter->nPMA = iNew;







>
>

|





<


|
|
>
|
>





>
>
>







894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909

910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931

      /* Open the second temp file, if it is not already open. */
      if( pTemp2==0 ){
        assert( iWrite2==0 );
        rc = vdbeSorterOpenTempFile(db, &pTemp2);
      }

      rc = fileWriterInit(db, pTemp2, &writer, iWrite2);

      if( rc==SQLITE_OK ){
        rc = fileWriterWriteVarint(&writer, nWrite);
      }

      if( rc==SQLITE_OK ){
        int bEof = 0;
        while( rc==SQLITE_OK && bEof==0 ){

          VdbeSorterIter *pIter = &pSorter->aIter[ pSorter->aTree[1] ];
          assert( pIter->pFile );

          rc = fileWriterWriteVarint(&writer, pIter->nKey);
          if( rc==SQLITE_OK ){
            rc = fileWriterWrite(&writer, pIter->aKey, pIter->nKey);
          }
          if( rc==SQLITE_OK ){
            rc = sqlite3VdbeSorterNext(db, pCsr, &bEof);
          }
        }
      }

      rc2 = fileWriterFinish(db, &writer, &iWrite2);
      if( rc==SQLITE_OK ) rc = rc2;
    }

    if( pSorter->nPMA<=SORTER_MAX_MERGE_COUNT ){
      break;
    }else{
      sqlite3_file *pTmp = pSorter->pTemp1;
      pSorter->nPMA = iNew;
Changes to test/index4.test.
12
13
14
15
16
17
18
























19
20
21
22
23
24
25
# focus of this file is testing the CREATE INDEX statement.
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

set testprefix index4

























do_execsql_test 1.1 {
  BEGIN;
    CREATE TABLE t1(x);
    INSERT INTO t1 VALUES(randomblob(102));
    INSERT INTO t1 SELECT randomblob(102) FROM t1;     --     2
    INSERT INTO t1 SELECT randomblob(102) FROM t1;     --     4







>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
# focus of this file is testing the CREATE INDEX statement.
#

set testdir [file dirname $argv0]
source $testdir/tester.tcl

set testprefix index4

#proc str {n} { string range [string repeat [format %.06d. $n] 20] 0 101 }
#db func str str
#do_execsql_test 1.1 {
#  BEGIN;
#    CREATE TABLE t1(x);
#    INSERT INTO t1 VALUES(str(1));
#    INSERT INTO t1 SELECT str(rowid + 1) FROM t1;        --     2
#    INSERT INTO t1 SELECT str(rowid + 2) FROM t1;        --     4
#    INSERT INTO t1 SELECT str(rowid + 4) FROM t1;        --     8
#    INSERT INTO t1 SELECT str(rowid + 8) FROM t1;        --    16
#    INSERT INTO t1 SELECT str(rowid + 16) FROM t1;       --    32
#    INSERT INTO t1 SELECT str(rowid + 32) FROM t1;       --    64
#    INSERT INTO t1 SELECT str(rowid + 64) FROM t1;       --   128
#    INSERT INTO t1 SELECT str(rowid + 128) FROM t1;      --   256
#    INSERT INTO t1 SELECT str(rowid + 256) FROM t1;      --   512
#    INSERT INTO t1 SELECT str(rowid + 512) FROM t1;      --  1024
#    INSERT INTO t1 SELECT str(rowid + 1024) FROM t1;     --  2048
#    INSERT INTO t1 SELECT str(rowid + 2048) FROM t1;     --  4096
#    INSERT INTO t1 SELECT str(rowid + 4096) FROM t1;     --  8192
#    INSERT INTO t1 SELECT str(rowid + 8192) FROM t1;     -- 16384
#    INSERT INTO t1 SELECT str(rowid + 16384) FROM t1;    -- 32768
#  COMMIT;
#}

do_execsql_test 1.1 {
  BEGIN;
    CREATE TABLE t1(x);
    INSERT INTO t1 VALUES(randomblob(102));
    INSERT INTO t1 SELECT randomblob(102) FROM t1;     --     2
    INSERT INTO t1 SELECT randomblob(102) FROM t1;     --     4