Many hyperlinks are disabled.
Use anonymous login
to enable hyperlinks.
Overview
Comment: | Delta-encode docids. This is good for around 22% reduction in index size with DL_POSITIONS. It improves performance about 5%-6%. (CVS 3511) |
---|---|
Downloads: | Tarball | ZIP archive |
Timelines: | family | ancestors | descendants | both | trunk |
Files: | files | file ages | folders |
SHA1: |
9b6d413d751d962b67cb4e3a208efe61 |
User & Date: | shess 2006-11-13 21:09:25.000 |
Context
2006-11-17
| ||
21:12 | Refactoring groundwork for coming work on interior nodes. Change LeafWriter to use empty data buffer (instead of empty term) to detect an empty block. Code to validate interior nodes. Moderate revisions to leaf-node and doclist validation. Recast leafWriterStep() in terms of LeafWriterStepMerge(). (CVS 3512) (check-in: f30771d5c7 user: shess tags: trunk) | |
2006-11-13
| ||
21:09 | Delta-encode docids. This is good for around 22% reduction in index size with DL_POSITIONS. It improves performance about 5%-6%. (CVS 3511) (check-in: 9b6d413d75 user: shess tags: trunk) | |
21:00 | Require a minimum fanout for interior nodes. This prevents cases where excessively large terms keep the tree from finding a single root. A downside is that this could result in large interior nodes in the presence of large terms, which may be prone to fragmentation, though if the nodes were smaller that would translate into more levels in the tree, which would also have that problem. (CVS 3510) (check-in: 64b7e34061 user: shess tags: trunk) | |
Changes
Changes to ext/fts2/fts2.c.
︙ | ︙ | |||
601 602 603 604 605 606 607 | /* Skip past current doclist element. */ assert( pReader->nElement<=pReader->nData ); pReader->pData += pReader->nElement; pReader->nData -= pReader->nElement; /* If there is more data, read the next doclist element. */ if( pReader->nData!=0 ){ | > | > | 601 602 603 604 605 606 607 608 609 610 611 612 613 614 615 616 617 | /* Skip past current doclist element. */ assert( pReader->nElement<=pReader->nData ); pReader->pData += pReader->nElement; pReader->nData -= pReader->nElement; /* If there is more data, read the next doclist element. */ if( pReader->nData!=0 ){ sqlite_int64 iDocidDelta; int iDummy, n = getVarint(pReader->pData, &iDocidDelta); pReader->iDocid += iDocidDelta; if( pReader->iType>=DL_POSITIONS ){ assert( n<pReader->nData ); while( 1 ){ n += getVarint32(pReader->pData+n, &iDummy); assert( n<=pReader->nData ); if( iDummy==POS_END ) break; if( iDummy==POS_COLUMN ){ |
︙ | ︙ | |||
645 646 647 648 649 650 651 | #ifndef NDEBUG /* Verify that the doclist can be validly decoded. Also returns the ** last docid found because it's convenient in other assertions for ** DLWriter. */ static int docListValidate(DocListType iType, const char *pData, int nData, sqlite_int64 *pLastDocid){ | < | < | | < < | < < < < < < < < < < < < < > > > > > > > > > > > > > | > > > > | > | > | | | | > > > > | > | > > > > | > | | > > | | < < < | 647 648 649 650 651 652 653 654 655 656 657 658 659 660 661 662 663 664 665 666 667 668 669 670 671 672 673 674 675 676 677 678 679 680 681 682 683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700 701 702 703 704 705 706 707 708 709 710 711 712 713 714 715 716 717 718 719 720 721 722 723 724 725 726 727 728 729 730 731 732 733 734 735 736 737 738 739 740 741 742 743 744 745 746 747 748 749 750 751 752 753 754 755 756 757 758 759 760 761 762 763 764 765 766 767 768 769 770 771 772 773 774 775 776 777 778 | #ifndef NDEBUG /* Verify that the doclist can be validly decoded. Also returns the ** last docid found because it's convenient in other assertions for ** DLWriter. */ static int docListValidate(DocListType iType, const char *pData, int nData, sqlite_int64 *pLastDocid){ sqlite_int64 iPrevDocid = 0; assert( pData!=0 ); assert( nData!=0 ); while( nData!=0 ){ sqlite_int64 iDocidDelta; int n = getVarint(pData, &iDocidDelta); iPrevDocid += iDocidDelta; if( iType>DL_DOCIDS ){ int iDummy; while( 1 ){ n += getVarint32(pData+n, &iDummy); if( iDummy==POS_END ) break; if( iDummy==POS_COLUMN ){ n += getVarint32(pData+n, &iDummy); }else if( iType>DL_POSITIONS ){ n += getVarint32(pData+n, &iDummy); n += getVarint32(pData+n, &iDummy); } assert( n<=nData ); } } assert( n<=nData ); pData += n; nData -= n; } if( pLastDocid ) *pLastDocid = iPrevDocid; return 1; } #endif /*******************************************************************/ /* DLWriter is used to write doclist data to a DataBuffer. DLWriter ** always appends to the buffer and does not own it. ** ** dlwInit - initialize to write a given type doclistto a buffer. ** dlwDestroy - clear the writer's memory. Does not free buffer. ** dlwAppend - append raw doclist data to buffer. ** dlwAdd - construct doclist element and append to buffer. */ typedef struct DLWriter { DocListType iType; DataBuffer *b; sqlite_int64 iPrevDocid; } DLWriter; static void dlwInit(DLWriter *pWriter, DocListType iType, DataBuffer *b){ pWriter->b = b; pWriter->iType = iType; pWriter->iPrevDocid = 0; } static void dlwDestroy(DLWriter *pWriter){ SCRAMBLE(pWriter); } /* iFirstDocid is the first docid in the doclist in pData. It is ** needed because pData may point within a larger doclist, in which ** case the first item would be delta-encoded. ** ** iLastDocid is the final docid in the doclist in pData. It is ** needed to create the new iPrevDocid for future delta-encoding. The ** code could decode the passed doclist to recreate iLastDocid, but ** the only current user (docListMerge) already has decoded this ** information. */ /* TODO(shess) This has become just a helper for docListMerge. ** Consider a refactor to make this cleaner. */ static void dlwAppend(DLWriter *pWriter, const char *pData, int nData, sqlite_int64 iFirstDocid, sqlite_int64 iLastDocid){ sqlite_int64 iDocid = 0; char c[VARINT_MAX]; int nFirstOld, nFirstNew; /* Old and new varint len of first docid. */ #ifndef NDEBUG sqlite_int64 iLastDocidDelta; #endif /* Recode the initial docid as delta from iPrevDocid. */ nFirstOld = getVarint(pData, &iDocid); assert( nFirstOld<nData || (nFirstOld==nData && pWriter->iType==DL_DOCIDS) ); nFirstNew = putVarint(c, iFirstDocid-pWriter->iPrevDocid); /* Verify that the incoming doclist is valid AND that it ends with ** the expected docid. This is essential because we'll trust this ** docid in future delta-encoding. */ assert( docListValidate(pWriter->iType, pData, nData, &iLastDocidDelta) ); assert( iLastDocid==iFirstDocid-iDocid+iLastDocidDelta ); /* Append recoded initial docid and everything else. Rest of docids ** should have been delta-encoded from previous initial docid. */ if( nFirstOld<nData ){ dataBufferAppend2(pWriter->b, c, nFirstNew, pData+nFirstOld, nData-nFirstOld); }else{ dataBufferAppend(pWriter->b, c, nFirstNew); } pWriter->iPrevDocid = iLastDocid; } static void dlwAdd(DLWriter *pWriter, sqlite_int64 iDocid, const char *pPosList, int nPosList){ char c[VARINT_MAX]; int n = putVarint(c, iDocid-pWriter->iPrevDocid); assert( pWriter->iPrevDocid<iDocid ); assert( pPosList==0 || pWriter->iType>DL_DOCIDS ); dataBufferAppend(pWriter->b, c, n); if( pWriter->iType>DL_DOCIDS ){ n = putVarint(c, 0); if( nPosList>0 ){ dataBufferAppend2(pWriter->b, pPosList, nPosList, c, n); }else{ dataBufferAppend(pWriter->b, c, n); } } pWriter->iPrevDocid = iDocid; } /*******************************************************************/ /* PLReader is used to read data from a document's position list. As ** the caller steps through the list, data is cached so that varints ** only need to be decoded once. ** |
︙ | ︙ | |||
1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 | static void docListMerge(DataBuffer *out, DLReader *pReaders, int nReaders){ OrderedDLReader readers[MERGE_COUNT]; DLWriter writer; int i, n; const char *pStart = 0; int nStart = 0; assert( nReaders>0 ); if( nReaders==1 ){ dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders)); return; } | > | 1056 1057 1058 1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 | static void docListMerge(DataBuffer *out, DLReader *pReaders, int nReaders){ OrderedDLReader readers[MERGE_COUNT]; DLWriter writer; int i, n; const char *pStart = 0; int nStart = 0; sqlite_int64 iFirstDocid = 0, iLastDocid = 0; assert( nReaders>0 ); if( nReaders==1 ){ dataBufferAppend(out, dlrDocData(pReaders), dlrAllDataBytes(pReaders)); return; } |
︙ | ︙ | |||
1079 1080 1081 1082 1083 1084 1085 | /* If this is a continuation of the current buffer to copy, extend ** that buffer. memcpy() seems to be more efficient if it has a ** lots of data to copy. */ if( dlrDocData(readers[0].pReader)==pStart+nStart ){ nStart += dlrDocDataBytes(readers[0].pReader); }else{ | | > > > > | | 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129 1130 1131 | /* If this is a continuation of the current buffer to copy, extend ** that buffer. memcpy() seems to be more efficient if it has a ** lots of data to copy. */ if( dlrDocData(readers[0].pReader)==pStart+nStart ){ nStart += dlrDocDataBytes(readers[0].pReader); }else{ if( pStart!=0 ){ dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid); } pStart = dlrDocData(readers[0].pReader); nStart = dlrDocDataBytes(readers[0].pReader); iFirstDocid = iDocid; } iLastDocid = iDocid; dlrStep(readers[0].pReader); /* Drop all of the older elements with the same docid. */ for(i=1; i<nReaders && !dlrAtEnd(readers[i].pReader) && dlrDocid(readers[i].pReader)==iDocid; i++){ dlrStep(readers[i].pReader); } /* Get the readers back into order. */ while( i-->0 ){ orderedDLReaderReorder(readers+i, nReaders-i); } } /* Copy over any remaining elements. */ if( nStart>0 ) dlwAppend(&writer, pStart, nStart, iFirstDocid, iLastDocid); dlwDestroy(&writer); } /* pLeft and pRight are DLReaders positioned to the same docid. ** ** If there are no instances in pLeft or pRight where the position ** of pLeft is one less than the position of pRight, then this |
︙ | ︙ | |||
4151 4152 4153 4154 4155 4156 4157 4158 4159 4160 4161 4162 4163 4164 | dataBufferAppendLenData(&pWriter->data, pData, nData); /* Flush standalone blocks right out */ if( nData+nTerm>STANDALONE_MIN ){ rc = leafWriterFlush(v, pWriter); if( rc!=SQLITE_OK ) return rc; } return SQLITE_OK; } /* Used to avoid a memmove when a large amount of doclist data is in ** the buffer. This constructs a node and term header before ** iDoclistData and flushes the resulting complete node using | > | 4169 4170 4171 4172 4173 4174 4175 4176 4177 4178 4179 4180 4181 4182 4183 | dataBufferAppendLenData(&pWriter->data, pData, nData); /* Flush standalone blocks right out */ if( nData+nTerm>STANDALONE_MIN ){ rc = leafWriterFlush(v, pWriter); if( rc!=SQLITE_OK ) return rc; } assert( leafNodeValidate(pWriter->data.pData, pWriter->data.nData) ); return SQLITE_OK; } /* Used to avoid a memmove when a large amount of doclist data is in ** the buffer. This constructs a node and term header before ** iDoclistData and flushes the resulting complete node using |
︙ | ︙ |