Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Refactor PLWriter to remove owned buffer. DLCollector (Document List Collector) now handles the case where PLWriter (Position List Writer) needed a local buffer. Change to using the associated DLWriter (Document List Writer) buffer, which reduces the number of memory copies needed in doclist processing, and brings PLWriter operation in line with DLWriter operation. (CVS 3707) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
d04fa3a13a84f49074c673b8ee2fb654 |
User & Date: | shess 2007-03-22 00:14:29.000 |
Context
2007-03-22
| ||
15:20 | Call sqlite3_free() instead of free() to release a buffer allocated by sqlite3_vmprintf() in test_async.c (test suite bug only). (CVS 3708) (check-in: b078f09bff user: danielk1977 tags: trunk) | |
00:14 | Refactor PLWriter to remove owned buffer. DLCollector (Document List Collector) now handles the case where PLWriter (Position List Writer) needed a local buffer. Change to using the associated DLWriter (Document List Writer) buffer, which reduces the number of memory copies needed in doclist processing, and brings PLWriter operation in line with DLWriter operation. (CVS 3707) (check-in: d04fa3a13a user: shess tags: trunk) | |
2007-03-20
| ||
23:52 |
Refactor PLWriter in preparation for buffered-document change.
Currently, PLWriter (Position List Writer) creates a locally-owned
DataBuffer to write into. This is necessary to support doclist
collection during tokenization, where there is no obvious buffer to
write output to, but is not necessary for the other users of PLWriter.
This change adds a DLCollector (Doc List Collector) structure to
handle the tokenization case.
Also fix a potential memory leak in writeZeroSegment(). In case of error from leafWriterStep(), the DataBuffer dl was being leaked. (CVS 3706) (check-in: 1b9918e207 user: shess tags: trunk) | |
Changes
Changes to ext/fts2/fts2.c.
︙ | ︙ | |||
686 687 688 689 690 691 692 693 694 695 696 697 698 699 | /* DLWriter is used to write doclist data to a DataBuffer. DLWriter ** always appends to the buffer and does not own it. ** ** dlwInit - initialize to write a given type doclistto a buffer. ** dlwDestroy - clear the writer's memory. Does not free buffer. ** dlwAppend - append raw doclist data to buffer. ** dlwAdd - construct doclist element and append to buffer. */ typedef struct DLWriter { DocListType iType; DataBuffer *b; sqlite_int64 iPrevDocid; } DLWriter; | > | 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 | /* DLWriter is used to write doclist data to a DataBuffer. DLWriter ** always appends to the buffer and does not own it. ** ** dlwInit - initialize to write a given type doclistto a buffer. ** dlwDestroy - clear the writer's memory. Does not free buffer. ** dlwAppend - append raw doclist data to buffer. ** dlwAdd - construct doclist element and append to buffer. ** Only apply dlwAdd() to DL_DOCIDS doclists (else use PLWriter). */ typedef struct DLWriter { DocListType iType; DataBuffer *b; sqlite_int64 iPrevDocid; } DLWriter; |
︙ | ︙ | |||
747 748 749 750 751 752 753 | dataBufferAppend2(pWriter->b, c, nFirstNew, pData+nFirstOld, nData-nFirstOld); }else{ dataBufferAppend(pWriter->b, c, nFirstNew); } pWriter->iPrevDocid = iLastDocid; } | | < | < < < < < < < < < | 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 | dataBufferAppend2(pWriter->b, c, nFirstNew, pData+nFirstOld, nData-nFirstOld); }else{ dataBufferAppend(pWriter->b, c, nFirstNew); } pWriter->iPrevDocid = iLastDocid; } static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid){ char c[VARINT_MAX]; int n = putVarint(c, iDocid-pWriter->iPrevDocid); assert( pWriter->iPrevDocid<iDocid ); assert( pWriter->iType==DL_DOCIDS ); dataBufferAppend(pWriter->b, c, n); pWriter->iPrevDocid = iDocid; } /*******************************************************************/ /* PLReader is used to read data from a document's position list. As ** the caller steps through the list, data is cached so that varints ** only need to be decoded once. |
︙ | ︙ | |||
850 851 852 853 854 855 856 | pReader->iEndOffset = pReader->iStartOffset+i; } assert( n<=pReader->nData ); pReader->pData += n; pReader->nData -= n; } | | < | | | > < < < < > > > > | < | | > > > | > > | < < | > | < > > > > | | | | > > | | > > > > | | > > > > > > > > | > > > | > > > | > < > > > > > > > > > > | > > | > | < > > > | > > | > > > > > < < > | < < > > | > | > > > < | 841 842 843 844 845 846 847 848 849 850 851 852 853 854 855 856 857 858 859 860 861 862 863 864 865 866 867 868 869 870 871 872 873 874 875 876 877 878 879 880 881 882 883 884 885 886 887 888 889 890 891 892 893 894 895 896 897 898 899 900 901 902 903 904 905 906 907 908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957 958 959 960 961 962 963 964 965 966 967 968 969 970 971 972 973 974 975 976 977 978 979 980 981 982 983 984 985 986 987 988 989 990 991 992 993 994 995 996 997 998 999 1000 1001 1002 1003 1004 1005 1006 1007 1008 1009 1010 1011 1012 1013 1014 1015 1016 1017 1018 1019 1020 1021 1022 1023 1024 1025 1026 1027 1028 1029 1030 1031 1032 1033 1034 1035 1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 | pReader->iEndOffset = pReader->iStartOffset+i; } assert( n<=pReader->nData ); pReader->pData += n; pReader->nData -= n; } static void plrInit(PLReader *pReader, DLReader *pDLReader){ pReader->pData = dlrPosData(pDLReader); pReader->nData = dlrPosDataLen(pDLReader); pReader->iType = pDLReader->iType; pReader->iColumn = 0; pReader->iPosition = 0; pReader->iStartOffset = 0; pReader->iEndOffset = 0; plrStep(pReader); } static void plrDestroy(PLReader *pReader){ SCRAMBLE(pReader); } /*******************************************************************/ /* PLWriter is used in constructing a document's position list. As a ** convenience, if iType is DL_DOCIDS, PLWriter becomes a no-op. ** PLWriter writes to the associated DLWriter's buffer. ** ** plwInit - init for writing a document's poslist. ** plwDestroy - clear a writer. ** plwAdd - append position and offset information. ** plwTerminate - add any necessary doclist terminator. ** ** Calling plwAdd() after plwTerminate() may result in a corrupt ** doclist. */ /* TODO(shess) Until we've written the second item, we can cache the ** first item's information. Then we'd have three states: ** ** - initialized with docid, no positions. ** - docid and one position. ** - docid and multiple positions. ** ** Only the last state needs to actually write to dlw->b, which would ** be an improvement in the DLCollector case. */ typedef struct PLWriter { DLWriter *dlw; int iColumn; /* the last column written */ int iPos; /* the last position written */ int iOffset; /* the last start offset written */ } PLWriter; /* TODO(shess) In the case where the parent is reading these values ** from a PLReader, we could optimize to a copy if that PLReader has ** the same type as pWriter. */ static void plwAdd(PLWriter *pWriter, int iColumn, int iPos, int iStartOffset, int iEndOffset){ /* Worst-case space for POS_COLUMN, iColumn, iPosDelta, ** iStartOffsetDelta, and iEndOffsetDelta. */ char c[5*VARINT_MAX]; int n = 0; /* Ban plwAdd() after plwTerminate(). */ assert( pWriter->iPos!=-1 ); if( pWriter->dlw->iType==DL_DOCIDS ) return; if( iColumn!=pWriter->iColumn ){ n += putVarint(c+n, POS_COLUMN); n += putVarint(c+n, iColumn); pWriter->iColumn = iColumn; pWriter->iPos = 0; pWriter->iOffset = 0; } assert( iPos>=pWriter->iPos ); n += putVarint(c+n, POS_BASE+(iPos-pWriter->iPos)); pWriter->iPos = iPos; if( pWriter->dlw->iType==DL_POSITIONS_OFFSETS ){ assert( iStartOffset>=pWriter->iOffset ); n += putVarint(c+n, iStartOffset-pWriter->iOffset); pWriter->iOffset = iStartOffset; assert( iEndOffset>=iStartOffset ); n += putVarint(c+n, iEndOffset-iStartOffset); } dataBufferAppend(pWriter->dlw->b, c, n); } static void plwInit(PLWriter *pWriter, DLWriter *dlw, sqlite_int64 iDocid){ char c[VARINT_MAX]; int n; pWriter->dlw = dlw; assert( iDocid>pWriter->dlw->iPrevDocid ); n = putVarint(c, iDocid-pWriter->dlw->iPrevDocid); dataBufferAppend(pWriter->dlw->b, c, n); pWriter->dlw->iPrevDocid = iDocid; pWriter->iColumn = 0; pWriter->iPos = 0; pWriter->iOffset = 0; } /* TODO(shess) Should plwDestroy() also terminate the doclist? But ** then plwDestroy() would no longer be just a destructor, it would ** also be doing work, which isn't consistent with the overall idiom. ** Another option would be for plwAdd() to always append any necessary ** terminator, so that the output is always correct. But that would ** add incremental work to the common case with the only benefit being ** API elegance. Punt for now. */ static void plwTerminate(PLWriter *pWriter){ if( pWriter->dlw->iType>DL_DOCIDS ){ char c[VARINT_MAX]; int n = putVarint(c, POS_END); dataBufferAppend(pWriter->dlw->b, c, n); } #ifndef NDEBUG /* Mark as terminated for assert in plwAdd(). */ pWriter->iPos = -1; #endif } static void plwDestroy(PLWriter *pWriter){ SCRAMBLE(pWriter); } /*******************************************************************/ /* DLCollector wraps PLWriter and DLWriter to provide a ** dynamically-allocated doclist area to use during tokenization. ** ** dlcNew - malloc up and initialize a collector. ** dlcDelete - destroy a collector and all contained items. ** dlcAddPos - append position and offset information. ** dlcAddDoclist - add the collected doclist to the given buffer. */ typedef struct DLCollector { DataBuffer b; DLWriter dlw; PLWriter plw; } DLCollector; /* TODO(shess) This could also be done by calling plwTerminate() and ** dataBufferAppend(). I tried that, expecting nominal performance ** differences, but it seemed to pretty reliably be worth 1% to code ** it this way. I suspect it's the incremental malloc overhead (some ** percentage of the plwTerminate() calls will cause a realloc), so ** this might be worth revisiting if the DataBuffer implementation ** changes. */ static void dlcAddDoclist(DLCollector *pCollector, DataBuffer *b){ if( pCollector->dlw.iType>DL_DOCIDS ){ char c[VARINT_MAX]; int n = putVarint(c, POS_END); dataBufferAppend2(b, pCollector->b.pData, pCollector->b.nData, c, n); }else{ dataBufferAppend(b, pCollector->b.pData, pCollector->b.nData); } } static void dlcAddPos(DLCollector *pCollector, int iColumn, int iPos, int iStartOffset, int iEndOffset){ plwAdd(&pCollector->plw, iColumn, iPos, iStartOffset, iEndOffset); } static DLCollector *dlcNew(sqlite_int64 iDocid, DocListType iType){ DLCollector *pCollector = malloc(sizeof(DLCollector)); dataBufferInit(&pCollector->b, 0); dlwInit(&pCollector->dlw, iType, &pCollector->b); plwInit(&pCollector->plw, &pCollector->dlw, iDocid); return pCollector; } static void dlcDelete(DLCollector *pCollector){ plwDestroy(&pCollector->plw); dlwDestroy(&pCollector->dlw); dataBufferDestroy(&pCollector->b); SCRAMBLE(pCollector); free(pCollector); } /* Copy the doclist data of iType in pData/nData into *out, trimming ** unnecessary data as we go. Only columns matching iColumn are ** copied, all columns copied if iColumn is -1. Elements with no ** matching columns are dropped. The output is an iOutType doclist. */ /* NOTE(shess) This code is only valid after all doclists are merged. ** If this is run before merges, then doclist items which represent ** deletion will be trimmed, and will thus not effect a deletion ** during the merge. */ static void docListTrim(DocListType iType, const char *pData, int nData, int iColumn, DocListType iOutType, DataBuffer *out){ DLReader dlReader; DLWriter dlWriter; assert( iOutType<=iType ); dlrInit(&dlReader, iType, pData, nData); dlwInit(&dlWriter, iOutType, out); while( !dlrAtEnd(&dlReader) ){ PLReader plReader; PLWriter plWriter; int match = 0; plrInit(&plReader, &dlReader); while( !plrAtEnd(&plReader) ){ if( iColumn==-1 || plrColumn(&plReader)==iColumn ){ if( !match ){ plwInit(&plWriter, &dlWriter, dlrDocid(&dlReader)); match = 1; } plwAdd(&plWriter, plrColumn(&plReader), plrPosition(&plReader), plrStartOffset(&plReader), plrEndOffset(&plReader)); } plrStep(&plReader); } if( match ){ plwTerminate(&plWriter); plwDestroy(&plWriter); } plrDestroy(&plReader); dlrStep(&dlReader); } dlwDestroy(&dlWriter); dlrDestroy(&dlReader); } /* Used by docListMerge() to keep doclists in the ascending order by ** docid, then ascending order by age (so the newest comes first). */ |
︙ | ︙ | |||
1168 1169 1170 1171 1172 1173 1174 | PLReader left, right; PLWriter writer; int match = 0; assert( dlrDocid(pLeft)==dlrDocid(pRight) ); assert( pOut->iType!=DL_POSITIONS_OFFSETS ); | | | < > > | > < < < < < | > > > < | 1209 1210 1211 1212 1213 1214 1215 1216 1217 1218 1219 1220 1221 1222 1223 1224 1225 1226 1227 1228 1229 1230 1231 1232 1233 1234 1235 1236 1237 1238 1239 1240 1241 1242 1243 1244 1245 1246 1247 1248 1249 1250 1251 1252 | PLReader left, right; PLWriter writer; int match = 0; assert( dlrDocid(pLeft)==dlrDocid(pRight) ); assert( pOut->iType!=DL_POSITIONS_OFFSETS ); plrInit(&left, pLeft); plrInit(&right, pRight); while( !plrAtEnd(&left) && !plrAtEnd(&right) ){ if( plrColumn(&left)<plrColumn(&right) ){ plrStep(&left); }else if( plrColumn(&left)>plrColumn(&right) ){ plrStep(&right); }else if( plrPosition(&left)+1<plrPosition(&right) ){ plrStep(&left); }else if( plrPosition(&left)+1>plrPosition(&right) ){ plrStep(&right); }else{ if( !match ){ plwInit(&writer, pOut, dlrDocid(pLeft)); match = 1; } plwAdd(&writer, plrColumn(&right), plrPosition(&right), 0, 0); plrStep(&left); plrStep(&right); } } if( match ){ plwTerminate(&writer); plwDestroy(&writer); } plrDestroy(&left); plrDestroy(&right); } /* We have two doclists with positions: pLeft and pRight. ** Write the phrase intersection of these two doclists into pOut. ** ** A phrase intersection means that two documents only match ** if pLeft.iPos+1==pRight.iPos. |
︙ | ︙ | |||
1268 1269 1270 1271 1272 1273 1274 | while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){ if( dlrDocid(&left)<dlrDocid(&right) ){ dlrStep(&left); }else if( dlrDocid(&right)<dlrDocid(&left) ){ dlrStep(&right); }else{ | | | 1308 1309 1310 1311 1312 1313 1314 1315 1316 1317 1318 1319 1320 1321 1322 | while( !dlrAtEnd(&left) && !dlrAtEnd(&right) ){ if( dlrDocid(&left)<dlrDocid(&right) ){ dlrStep(&left); }else if( dlrDocid(&right)<dlrDocid(&left) ){ dlrStep(&right); }else{ dlwAdd(&writer, dlrDocid(&left)); dlrStep(&left); dlrStep(&right); } } dlrDestroy(&left); dlrDestroy(&right); |
︙ | ︙ | |||
1306 1307 1308 1309 1310 1311 1312 | dlrInit(&left, DL_DOCIDS, pLeft, nLeft); dlrInit(&right, DL_DOCIDS, pRight, nRight); dlwInit(&writer, DL_DOCIDS, pOut); while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){ if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){ | | | | | 1346 1347 1348 1349 1350 1351 1352 1353 1354 1355 1356 1357 1358 1359 1360 1361 1362 1363 1364 1365 1366 | dlrInit(&left, DL_DOCIDS, pLeft, nLeft); dlrInit(&right, DL_DOCIDS, pRight, nRight); dlwInit(&writer, DL_DOCIDS, pOut); while( !dlrAtEnd(&left) || !dlrAtEnd(&right) ){ if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){ dlwAdd(&writer, dlrDocid(&left)); dlrStep(&left); }else if( dlrAtEnd(&left) || dlrDocid(&right)<dlrDocid(&left) ){ dlwAdd(&writer, dlrDocid(&right)); dlrStep(&right); }else{ dlwAdd(&writer, dlrDocid(&left)); dlrStep(&left); dlrStep(&right); } } dlrDestroy(&left); dlrDestroy(&right); |
︙ | ︙ | |||
1350 1351 1352 1353 1354 1355 1356 | dlwInit(&writer, DL_DOCIDS, pOut); while( !dlrAtEnd(&left) ){ while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){ dlrStep(&right); } if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){ | | | 1390 1391 1392 1393 1394 1395 1396 1397 1398 1399 1400 1401 1402 1403 1404 | dlwInit(&writer, DL_DOCIDS, pOut); while( !dlrAtEnd(&left) ){ while( !dlrAtEnd(&right) && dlrDocid(&right)<dlrDocid(&left) ){ dlrStep(&right); } if( dlrAtEnd(&right) || dlrDocid(&left)<dlrDocid(&right) ){ dlwAdd(&writer, dlrDocid(&left)); } dlrStep(&left); } dlrDestroy(&left); dlrDestroy(&right); dlwDestroy(&writer); |
︙ | ︙ |