SQLite

Check-in [81987c8ceb]
Login

Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Instead of allocating a single large buffer at the beginning of each sort operation, start with a small buffer and extend it using realloc() as required.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | orderby-planning
Files: files | file ages | folders
SHA1: 81987c8ceb64f051528a6ca42673821d9ab7c0ff
User & Date: dan 2014-03-27 19:25:02.222
Context
2014-03-28
18:35
Merge the latest changes from trunk. (check-in: 3047a25f1c user: drh tags: orderby-planning)
2014-03-27
19:25
Instead of allocating a single large buffer at the beginning of each sort operation, start with a small buffer and extend it using realloc() as required. (check-in: 81987c8ceb user: dan tags: orderby-planning)
17:23
Use xFetch() to access temporary files in vdbesort.c. Use a single large allocation instead of many small allocations when accumulating records in vdbesort.c. This is an interim commit - it allocates a buffer the size of the page-cache every time data is sorted. (check-in: f4ac1bf28c user: dan tags: orderby-planning)
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/vdbesort.c.
103
104
105
106
107
108
109

110
111
112
113
114
115
116
  VdbeSorterIter *aIter;          /* Array of iterators to merge */
  int *aTree;                     /* Current state of incremental merge */
  sqlite3_file *pTemp1;           /* PMA file 1 */
  SorterRecord *pRecord;          /* Head of in-memory record list */
  UnpackedRecord *pUnpacked;      /* Used to unpack keys */
  u8* aMemory;                    /* Block to allocate records from */
  int iMemory;                    /* Offset of free space in aMemory */

};

/*
** The following type is an iterator for a PMA. It caches the current key in 
** variables nKey/aKey. If the iterator is at EOF, pFile==0.
*/
struct VdbeSorterIter {







>







103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
  VdbeSorterIter *aIter;          /* Array of iterators to merge */
  int *aTree;                     /* Current state of incremental merge */
  sqlite3_file *pTemp1;           /* PMA file 1 */
  SorterRecord *pRecord;          /* Head of in-memory record list */
  UnpackedRecord *pUnpacked;      /* Used to unpack keys */
  u8* aMemory;                    /* Block to allocate records from */
  int iMemory;                    /* Offset of free space in aMemory */
  int nMemory;                    /* Current size of allocation at aMemory */
};

/*
** The following type is an iterator for a PMA. It caches the current key in 
** variables nKey/aKey. If the iterator is at EOF, pFile==0.
*/
struct VdbeSorterIter {
140
141
142
143
144
145
146
147




148









149
150
151
152

153


154







155
156
157
158
159
160
161
  int iBufEnd;                    /* Last byte of buffer to write */
  i64 iWriteOff;                  /* Offset of start of buffer in file */
  sqlite3_file *pFile;            /* File to write to */
};

/*
** A structure to store a single record. All in-memory records are connected
** together into a linked list headed at VdbeSorter.pRecord using the 




** SorterRecord.pNext pointer.









*/
struct SorterRecord {
  void *pVal;
  int nVal;

  SorterRecord *pNext;


};








/* Minimum allowable value for the VdbeSorter.nWorking variable */
#define SORTER_MIN_WORKING 10

/* Maximum number of segments to merge in a single pass. */
#define SORTER_MAX_MERGE_COUNT 16








|
>
>
>
>
|
>
>
>
>
>
>
>
>
>


<

>
|
>
>

>
>
>
>
>
>
>







141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164

165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
  int iBufEnd;                    /* Last byte of buffer to write */
  i64 iWriteOff;                  /* Offset of start of buffer in file */
  sqlite3_file *pFile;            /* File to write to */
};

/*
** A structure to store a single record. All in-memory records are connected
** together into a linked list headed at VdbeSorter.pRecord.
**
** How the linked list is connected depends on how memory is being managed
** by this module. If using a separate allocation for each in-memory record
** (VdbeSorter.aMemory==0), then the list is always connected using the 
** SorterRecord.u.pNext pointers.
**
** Or, if using the single large allocation method (VdbeSorter.aMemory!=0),
** then while records are being accumulated the list is linked using the
** SorterRecord.u.iNext offset. This is because the aMemory[] array may
** be sqlite3Realloc()ed while records are being accumulated. Once the VM
** has finished passing records to the sorter, or when the in-memory buffer
** is full, the list is sorted. As part of the sorting process, it is
** converted to use the SorterRecord.u.pNext pointers. See function
** vdbeSorterSort() for details.
*/
struct SorterRecord {

  int nVal;
  union {
    SorterRecord *pNext;          /* Pointer to next record in list */
    int iNext;                    /* Offset within aMemory of next record */
  } u;
};

/* Return a pointer to the buffer containing the record data for SorterRecord
** object p. Should be used as if:
**
**   void *SRVAL(SorterRecord *p) { return (void*)&p[1]; }
*/
#define SRVAL(p) ((void*)((SorterRecord*)(p) + 1))

/* Minimum allowable value for the VdbeSorter.nWorking variable */
#define SORTER_MIN_WORKING 10

/* Maximum number of segments to merge in a single pass. */
#define SORTER_MAX_MERGE_COUNT 16

509
510
511
512
513
514
515


516

517


518

519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
  if( !sqlite3TempInMemory(db) ){
    pgsz = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
    pSorter->mnPmaSize = SORTER_MIN_WORKING * pgsz;
    mxCache = db->aDb[0].pSchema->cache_size;
    if( mxCache<SORTER_MIN_WORKING ) mxCache = SORTER_MIN_WORKING;
    pSorter->mxPmaSize = mxCache * pgsz;



    pSorter->aMemory = (u8*)sqlite3DbMallocRaw(db, pSorter->mxPmaSize);

    assert( pSorter->iMemory==0 );


    if( !pSorter->aMemory ) return SQLITE_NOMEM;

  }

  return SQLITE_OK;
}

/*
** Free the list of sorted records starting at pRecord.
*/
static void vdbeSorterRecordFree(sqlite3 *db, SorterRecord *pRecord){
  SorterRecord *p;
  SorterRecord *pNext;
  for(p=pRecord; p; p=pNext){
    pNext = p->pNext;
    sqlite3DbFree(db, p);
  }
}

/*
** Reset a sorting cursor back to its original empty state.
*/







>
>
|
>
|
>
>
|
>












|







532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
  if( !sqlite3TempInMemory(db) ){
    pgsz = sqlite3BtreeGetPageSize(db->aDb[0].pBt);
    pSorter->mnPmaSize = SORTER_MIN_WORKING * pgsz;
    mxCache = db->aDb[0].pSchema->cache_size;
    if( mxCache<SORTER_MIN_WORKING ) mxCache = SORTER_MIN_WORKING;
    pSorter->mxPmaSize = mxCache * pgsz;

    /* If the application is using memsys3 or memsys5, use a separate 
    ** allocation for each sort-key in memory. Otherwise, use a single big
    ** allocation at pSorter->aMemory for all sort-keys.  */
    if( sqlite3GlobalConfig.pHeap==0 ){
      assert( pSorter->iMemory==0 );
      pSorter->nMemory = pgsz;
      pSorter->aMemory = (u8*)sqlite3Malloc(pSorter->nMemory);
      if( !pSorter->aMemory ) return SQLITE_NOMEM;
    }
  }

  return SQLITE_OK;
}

/*
** Free the list of sorted records starting at pRecord.
*/
static void vdbeSorterRecordFree(sqlite3 *db, SorterRecord *pRecord){
  SorterRecord *p;
  SorterRecord *pNext;
  for(p=pRecord; p; p=pNext){
    pNext = p->u.pNext;
    sqlite3DbFree(db, p);
  }
}

/*
** Reset a sorting cursor back to its original empty state.
*/
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
  const VdbeCursor *pCsr,         /* For pKeyInfo */
  SorterRecord *p1,               /* First list to merge */
  SorterRecord *p2,               /* Second list to merge */
  SorterRecord **ppOut            /* OUT: Head of merged list */
){
  SorterRecord *pFinal = 0;
  SorterRecord **pp = &pFinal;
  void *pVal2 = p2 ? p2->pVal : 0;

  while( p1 && p2 ){
    int res;
    vdbeSorterCompare(pCsr, 0, p1->pVal, p1->nVal, pVal2, p2->nVal, &res);
    if( res<=0 ){
      *pp = p1;
      pp = &p1->pNext;
      p1 = p1->pNext;
      pVal2 = 0;
    }else{
      *pp = p2;
       pp = &p2->pNext;
      p2 = p2->pNext;
      if( p2==0 ) break;
      pVal2 = p2->pVal;
    }
  }
  *pp = p1 ? p1 : p2;
  *ppOut = pFinal;
}

/*







|



|


|
|



|
|

|







636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
  const VdbeCursor *pCsr,         /* For pKeyInfo */
  SorterRecord *p1,               /* First list to merge */
  SorterRecord *p2,               /* Second list to merge */
  SorterRecord **ppOut            /* OUT: Head of merged list */
){
  SorterRecord *pFinal = 0;
  SorterRecord **pp = &pFinal;
  void *pVal2 = p2 ? SRVAL(p2) : 0;

  while( p1 && p2 ){
    int res;
    vdbeSorterCompare(pCsr, 0, SRVAL(p1), p1->nVal, pVal2, p2->nVal, &res);
    if( res<=0 ){
      *pp = p1;
      pp = &p1->u.pNext;
      p1 = p1->u.pNext;
      pVal2 = 0;
    }else{
      *pp = p2;
       pp = &p2->u.pNext;
      p2 = p2->u.pNext;
      if( p2==0 ) break;
      pVal2 = SRVAL(p2);
    }
  }
  *pp = p1 ? p1 : p2;
  *ppOut = pFinal;
}

/*
652
653
654
655
656
657
658
659



660







661
662
663
664
665
666
667
  aSlot = (SorterRecord **)sqlite3MallocZero(64 * sizeof(SorterRecord *));
  if( !aSlot ){
    return SQLITE_NOMEM;
  }

  p = pSorter->pRecord;
  while( p ){
    SorterRecord *pNext = p->pNext;



    p->pNext = 0;







    for(i=0; aSlot[i]; i++){
      vdbeSorterMerge(pCsr, p, aSlot[i], &p);
      aSlot[i] = 0;
    }
    aSlot[i] = p;
    p = pNext;
  }







|
>
>
>
|
>
>
>
>
>
>
>







681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
  aSlot = (SorterRecord **)sqlite3MallocZero(64 * sizeof(SorterRecord *));
  if( !aSlot ){
    return SQLITE_NOMEM;
  }

  p = pSorter->pRecord;
  while( p ){
    SorterRecord *pNext;
    if( pSorter->aMemory ){
      assert( p->u.iNext<pSorter->nMemory );
      if( (u8*)p==pSorter->aMemory ){
        pNext = 0;
      }else{
        pNext = (SorterRecord*)&pSorter->aMemory[p->u.iNext];
      }
    }else{
      pNext = p->u.pNext;
    }
    p->u.pNext = 0;
    for(i=0; aSlot[i]; i++){
      vdbeSorterMerge(pCsr, p, aSlot[i], &p);
      aSlot[i] = 0;
    }
    aSlot[i] = p;
    p = pNext;
  }
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875

876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891


892



893
894
895
896
897
898
899
900


901
902


903
904
905


906
907
908
909
910
911
912
913
914
915
916
917

































918
919
920
921
922
923
924
    SorterRecord *p;
    SorterRecord *pNext = 0;

    fileWriterInit(db, pSorter->pTemp1, &writer, pSorter->iWriteOff);
    pSorter->nPMA++;
    fileWriterWriteVarint(&writer, pSorter->nInMemory);
    for(p=pSorter->pRecord; p; p=pNext){
      pNext = p->pNext;
      fileWriterWriteVarint(&writer, p->nVal);
      fileWriterWrite(&writer, p->pVal, p->nVal);
      if( pSorter->aMemory==0 ) sqlite3DbFree(db, p);
    }
    pSorter->pRecord = p;
    rc = fileWriterFinish(db, &writer, &pSorter->iWriteOff);
  }

  if( pSorter->aMemory ) pSorter->pRecord = 0;
  assert( pSorter->pRecord==0 || rc!=SQLITE_OK );
  return rc;
}

/*
** Add a record to the sorter.
*/
int sqlite3VdbeSorterWrite(
  sqlite3 *db,                    /* Database handle */
  const VdbeCursor *pCsr,               /* Sorter cursor */
  Mem *pVal                       /* Memory cell containing record */
){
  VdbeSorter *pSorter = pCsr->pSorter;
  SorterRecord sRecord;           /* Used for aMemory overflow record */
  int rc = SQLITE_OK;             /* Return Code */
  SorterRecord *pNew;             /* New list element */

  assert( pSorter );
  pSorter->nInMemory += sqlite3VarintLen(pVal->n) + pVal->n;

  if( pSorter->aMemory ){
    int nReq = sizeof(SorterRecord) + pVal->n;
    if( (pSorter->iMemory+nReq) > pSorter->mxPmaSize ){
      pNew = &sRecord;
      pNew->pVal = pVal->z;
    }else{
      pNew = &pSorter->aMemory[pSorter->iMemory];
      pSorter->iMemory += ROUND8(nReq);

    }
  }else{
    pNew = (SorterRecord *)sqlite3DbMallocRaw(db, pVal->n+sizeof(SorterRecord));
    if( pNew==0 ){
      return SQLITE_NOMEM;
    }
  }

  if( pNew!=&sRecord ){
    pNew->pVal = (void*)&pNew[1];
    memcpy(pNew->pVal, pVal->z, pVal->n);
  }
  pNew->nVal = pVal->n;
  pNew->pNext = pSorter->pRecord;
  pSorter->pRecord = pNew;



  /* See if the contents of the sorter should now be written out. They



  ** are written out when either of the following are true:
  **
  **   * The total memory allocated for the in-memory list is greater 
  **     than (page-size * cache-size), or
  **
  **   * The total memory allocated for the in-memory list is greater 
  **     than (page-size * 10) and sqlite3HeapNearlyFull() returns true.
  */


  if( pSorter->mxPmaSize>0 ){
    if( (pNew==&sRecord) || (pSorter->aMemory==0 && (


        (pSorter->nInMemory > pSorter->mxPmaSize)
     || (pSorter->nInMemory > pSorter->mnPmaSize && sqlite3HeapNearlyFull())
    ))){


#ifdef SQLITE_DEBUG
      i64 nExpect = pSorter->iWriteOff
        + sqlite3VarintLen(pSorter->nInMemory)
        + pSorter->nInMemory;
#endif
      rc = vdbeSorterListToPMA(db, pCsr);
      pSorter->nInMemory = 0;
      pSorter->iMemory = 0;
      assert( rc!=SQLITE_OK || (nExpect==pSorter->iWriteOff) );
      assert( rc!=SQLITE_OK || pSorter->pRecord==0 );
    }
  }


































  return rc;
}

/*
** Helper function for sqlite3VdbeSorterRewind(). 
*/







|

|




















<



<
<
|
<
|
<
<
<
<
<
<
>
|
<
<
|
<
|
<
|
<
<
<
<
<
<
<
|
>
>
|
>
>
>
|







>
>
|
|
>
>


|
>
>

|
|
|

|
|
|
|
|
|
|
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>
>







870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899

900
901
902


903

904






905
906


907

908

909







910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
    SorterRecord *p;
    SorterRecord *pNext = 0;

    fileWriterInit(db, pSorter->pTemp1, &writer, pSorter->iWriteOff);
    pSorter->nPMA++;
    fileWriterWriteVarint(&writer, pSorter->nInMemory);
    for(p=pSorter->pRecord; p; p=pNext){
      pNext = p->u.pNext;
      fileWriterWriteVarint(&writer, p->nVal);
      fileWriterWrite(&writer, SRVAL(p), p->nVal);
      if( pSorter->aMemory==0 ) sqlite3DbFree(db, p);
    }
    pSorter->pRecord = p;
    rc = fileWriterFinish(db, &writer, &pSorter->iWriteOff);
  }

  if( pSorter->aMemory ) pSorter->pRecord = 0;
  assert( pSorter->pRecord==0 || rc!=SQLITE_OK );
  return rc;
}

/*
** Add a record to the sorter.
*/
int sqlite3VdbeSorterWrite(
  sqlite3 *db,                    /* Database handle */
  const VdbeCursor *pCsr,               /* Sorter cursor */
  Mem *pVal                       /* Memory cell containing record */
){
  VdbeSorter *pSorter = pCsr->pSorter;

  int rc = SQLITE_OK;             /* Return Code */
  SorterRecord *pNew;             /* New list element */



  int bFlush;                     /* True to flush contents of memory to PMA */

  int nReq;                       /* Bytes of memory required */






  int nPMA;                       /* Bytes of PMA space required */



  assert( pSorter );



  /* Figure out whether or not the current contents of memory should be







  ** flushed to a PMA before continuing. If so, do so.
  **
  ** If using the single large allocation mode (pSorter->aMemory!=0), then
  ** flush the contents of memory to a new PMA if (a) at least one value is
  ** already in memory and (b) the new value will not fit in memory.
  ** 
  ** Or, if using separate allocations for each record, flush the contents
  ** of memory to a PMA if either of the following are true:
  **
  **   * The total memory allocated for the in-memory list is greater 
  **     than (page-size * cache-size), or
  **
  **   * The total memory allocated for the in-memory list is greater 
  **     than (page-size * 10) and sqlite3HeapNearlyFull() returns true.
  */
  nReq = pVal->n + sizeof(SorterRecord);
  nPMA = pVal->n + sqlite3VarintLen(pVal->n);
  if( pSorter->aMemory ){
    bFlush = pSorter->iMemory && (pSorter->iMemory+nReq) > pSorter->mxPmaSize;
  }else{
    bFlush = (
        (pSorter->nInMemory > pSorter->mxPmaSize)
     || (pSorter->nInMemory > pSorter->mnPmaSize && sqlite3HeapNearlyFull())
    );
  }
  if( bFlush ){
#ifdef SQLITE_DEBUG
    i64 nExpect = pSorter->iWriteOff
      + sqlite3VarintLen(pSorter->nInMemory)
      + pSorter->nInMemory;
#endif
    rc = vdbeSorterListToPMA(db, pCsr);
    pSorter->nInMemory = 0;
    pSorter->iMemory = 0;
    assert( rc!=SQLITE_OK || (nExpect==pSorter->iWriteOff) );
    assert( rc!=SQLITE_OK || pSorter->pRecord==0 );
  }

  pSorter->nInMemory += nPMA;

  if( pSorter->aMemory ){
    int nMin = pSorter->iMemory + nReq;

    if( nMin>pSorter->nMemory ){
      u8 *aNew;
      int nNew = pSorter->nMemory * 2;
      while( nNew < nMin ) nNew = nNew*2;
      if( nNew > pSorter->mxPmaSize ) nNew = pSorter->mxPmaSize;
      if( nNew < nMin ) nNew = nMin;

      aNew = sqlite3Realloc(pSorter->aMemory, nNew);
      if( !aNew ) return SQLITE_NOMEM;
      pSorter->pRecord = aNew + ((u8*)pSorter->pRecord - pSorter->aMemory);
      pSorter->aMemory = aNew;
      pSorter->nMemory = nNew;
    }

    pNew = (SorterRecord*)&pSorter->aMemory[pSorter->iMemory];
    pSorter->iMemory += ROUND8(nReq);
    pNew->u.iNext = (u8*)(pSorter->pRecord) - pSorter->aMemory;
  }else{
    pNew = (SorterRecord *)sqlite3DbMallocRaw(db, pVal->n+sizeof(SorterRecord));
    if( pNew==0 ){
      return SQLITE_NOMEM;
    }
    pNew->u.pNext = pSorter->pRecord;
  }

  memcpy(SRVAL(pNew), pVal->z, pVal->n);
  pNew->nVal = pVal->n;
  pSorter->pRecord = pNew;

  return rc;
}

/*
** Helper function for sqlite3VdbeSorterRewind(). 
*/
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
          pIter1 = &pSorter->aIter[ pSorter->aTree[i ^ 0x0001] ];
        }
      }
      *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
    }
  }else{
    SorterRecord *pFree = pSorter->pRecord;
    pSorter->pRecord = pFree->pNext;
    pFree->pNext = 0;
    if( pSorter->aMemory==0 ){
      vdbeSorterRecordFree(db, pFree);
    }
    *pbEof = !pSorter->pRecord;
    rc = SQLITE_OK;
  }
  return rc;







|
|







1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
          pIter1 = &pSorter->aIter[ pSorter->aTree[i ^ 0x0001] ];
        }
      }
      *pbEof = (pSorter->aIter[pSorter->aTree[1]].pFile==0);
    }
  }else{
    SorterRecord *pFree = pSorter->pRecord;
    pSorter->pRecord = pFree->u.pNext;
    pFree->u.pNext = 0;
    if( pSorter->aMemory==0 ){
      vdbeSorterRecordFree(db, pFree);
    }
    *pbEof = !pSorter->pRecord;
    rc = SQLITE_OK;
  }
  return rc;
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
  if( pSorter->aTree ){
    VdbeSorterIter *pIter;
    pIter = &pSorter->aIter[ pSorter->aTree[1] ];
    *pnKey = pIter->nKey;
    pKey = pIter->aKey;
  }else{
    *pnKey = pSorter->pRecord->nVal;
    pKey = pSorter->pRecord->pVal;
  }
  return pKey;
}

/*
** Copy the current sorter key into the memory cell pOut.
*/







|







1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
  if( pSorter->aTree ){
    VdbeSorterIter *pIter;
    pIter = &pSorter->aIter[ pSorter->aTree[1] ];
    *pnKey = pIter->nKey;
    pKey = pIter->aKey;
  }else{
    *pnKey = pSorter->pRecord->nVal;
    pKey = SRVAL(pSorter->pRecord);
  }
  return pKey;
}

/*
** Copy the current sorter key into the memory cell pOut.
*/