Many hyperlinks are disabled.
Use anonymous login to enable hyperlinks.

Overview
Comment:Change the data encoding (again) to make content-in-key use fewer bytes, since one suspects that this will become a common encoding. FILE FORMAT CHANGE.
Downloads: Tarball | ZIP archive
Timelines: family | ancestors | descendants | both | trunk
Files: files | file ages | folders
SHA1: 0275ee48db5393cd4229434f784d91f1ff9567c5
User & Date: drh 2013-07-26 16:20:07.396
Context
2013-07-26
16:59
Fix minor errors in the key encoder. check-in: 0a923f20d2 user: drh tags: trunk
16:20
Change the data encoding (again) to make content-in-key use fewer bytes, since one suspects that this will become a common encoding. FILE FORMAT CHANGE. check-in: 0275ee48db user: drh tags: trunk
15:30
Add decoder logic to facilitate extracting column values from the key of a key/value pair for a row. check-in: 30167422e4 user: drh tags: trunk
Changes
Unified Diff Ignore Whitespace Patch
Changes to src/vdbecodec.c.
462
463
464
465
466
467
468
469
470
471




472
473
474
475
476

477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
  ofst += n;
  endHdr = ofst;
  if( endHdr>p->n ) return SQLITE4_CORRUPT;
  for(i=0; i<=iVal && n<endHdr; i++){
    sz = sqlite4GetVarint64(p->a+n, p->n-n, &type);
    if( sz==0 ) return SQLITE4_CORRUPT;
    n += sz;
    if( type>=24 ){
      cclass = (type-24)%3;
      if( cclass==2 ){




         sz = sqlite4GetVarint64(p->a+n, p->n-n, &subtype);
         if( sz==0 ) return SQLITE4_CORRUPT;
         n += sz;
      }
      size = (type-24)/3;

    }else if( type<=2 ){
      size = 0;
    }else if( type<=10 ){
      size = type - 2;
    }else if( type<=21 ){
      size = type - 9;
    }else{
      /* value in key */
      size = 0;
      sz = sqlite4GetVarint64(p->a+n, p->n-n, &subtype);
      if( sz==0 ) return SQLITE4_CORRUPT;
      n += sz;
    }
    if( i<iVal ){
      ofst += size;
    }else if( type==0 ){
      /* no-op */
    }else if( type<=2 ){
      sqlite4VdbeMemSetInt64(pOut, type-1);







|
|

>
>
>
>
|
|
|
|
<
>
|

|

<
<

|
|
<
<
<







462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479

480
481
482
483
484


485
486
487



488
489
490
491
492
493
494
  ofst += n;
  endHdr = ofst;
  if( endHdr>p->n ) return SQLITE4_CORRUPT;
  for(i=0; i<=iVal && n<endHdr; i++){
    sz = sqlite4GetVarint64(p->a+n, p->n-n, &type);
    if( sz==0 ) return SQLITE4_CORRUPT;
    n += sz;
    if( type>=22 ){  /* STRING, BLOB, KEY, and TYPED */
      cclass = (type-22)%4;
      if( cclass==2 ){
        size = 0;  /* KEY */
      }else{
        size = (type-22)/4;
        if( cclass==3 ){  /* The TYPED header code */
          sz = sqlite4GetVarint64(p->a+n, p->n-n, &subtype);
          if( sz==0 ) return SQLITE4_CORRUPT;
          n += sz;
        }

      }
    }else if( type<=2 ){  /* NULL, ZERO, and ONE */
      size = 0;
    }else if( type<=10 ){ /* INT */
      size = type - 2;


    }else{
      assert( type>=11 && type<=21 );  /* NUM */
      size = type - 9;



    }
    if( i<iVal ){
      ofst += size;
    }else if( type==0 ){
      /* no-op */
    }else if( type<=2 ){
      sqlite4VdbeMemSetInt64(pOut, type-1);
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531



532
533
534
535
536
537
538

      num.m = x;
      num.e = (e >> 2);
      if( e & 0x02 ) num.e = -1 * num.e;
      if( e & 0x01 ) num.sign = 1;
      pOut->u.num = num;
      MemSetTypeFlag(pOut, MEM_Real);
    }else if( type<=23 ){
      return decoderFromKey(p, type==23, subtype, pOut);
    }else if( cclass==0 ){
      if( size==0 ){
        sqlite4VdbeMemSetStr(pOut, "", 0, SQLITE4_UTF8, SQLITE4_TRANSIENT, 0);
      }else if( p->a[ofst]>0x02 ){
        sqlite4VdbeMemSetStr(pOut, (char*)(p->a+ofst), size, 
                             SQLITE4_UTF8, SQLITE4_TRANSIENT, 0);
      }else{
        static const u8 enc[] = {SQLITE4_UTF8,SQLITE4_UTF16LE,SQLITE4_UTF16BE };
        sqlite4VdbeMemSetStr(pOut, (char*)(p->a+ofst+1), size-1, 
                             enc[p->a[ofst]], SQLITE4_TRANSIENT, 0);
      }



    }else{
      sqlite4VdbeMemSetStr(pOut, (char*)(p->a+ofst), size, 0,
                           SQLITE4_TRANSIENT, 0);
      pOut->enc = ENC(p->db);
    }
  }
  testcase( i==iVal );







<
<











>
>
>







511
512
513
514
515
516
517


518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538

      num.m = x;
      num.e = (e >> 2);
      if( e & 0x02 ) num.e = -1 * num.e;
      if( e & 0x01 ) num.sign = 1;
      pOut->u.num = num;
      MemSetTypeFlag(pOut, MEM_Real);


    }else if( cclass==0 ){
      if( size==0 ){
        sqlite4VdbeMemSetStr(pOut, "", 0, SQLITE4_UTF8, SQLITE4_TRANSIENT, 0);
      }else if( p->a[ofst]>0x02 ){
        sqlite4VdbeMemSetStr(pOut, (char*)(p->a+ofst), size, 
                             SQLITE4_UTF8, SQLITE4_TRANSIENT, 0);
      }else{
        static const u8 enc[] = {SQLITE4_UTF8,SQLITE4_UTF16LE,SQLITE4_UTF16BE };
        sqlite4VdbeMemSetStr(pOut, (char*)(p->a+ofst+1), size-1, 
                             enc[p->a[ofst]], SQLITE4_TRANSIENT, 0);
      }
    }else if( cclass==2 ){
      unsigned int k = (type - 24)/4;
      return decoderFromKey(p, (k&1)!=0, k/2, pOut);
    }else{
      sqlite4VdbeMemSetStr(pOut, (char*)(p->a+ofst), size, 0,
                           SQLITE4_TRANSIENT, 0);
      pOut->enc = ENC(p->db);
    }
  }
  testcase( i==iVal );
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
      aAux[i].n = n;
      aOut[nOut++] = n+9;
      nPayload += n;
    }else if( flags & MEM_Str ){
      n = pIn->n;
      if( n && (encoding!=SQLITE4_UTF8 || pIn->z[0]<3) ) n++;
      nPayload += n;
      nOut += sqlite4PutVarint64(aOut+nOut, 24+3*(sqlite4_int64)n);
    }else{
      n = pIn->n;
      assert( flags & MEM_Blob );
      nPayload += n;
      nOut += sqlite4PutVarint64(aOut+nOut, 25+3*(sqlite4_int64)n);
    }
  }
  nHdr = nOut - 9;
  n = sqlite4PutVarint64(aOut, nHdr);
  for(i=n, j=9; j<nOut; j++) aOut[i++] = aOut[j];
  nOut = i;
  aOut = sqlite4DbReallocOrFree(db, aOut, nOut + nPayload);







|




|







629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
      aAux[i].n = n;
      aOut[nOut++] = n+9;
      nPayload += n;
    }else if( flags & MEM_Str ){
      n = pIn->n;
      if( n && (encoding!=SQLITE4_UTF8 || pIn->z[0]<3) ) n++;
      nPayload += n;
      nOut += sqlite4PutVarint64(aOut+nOut, 22+4*(sqlite4_int64)n);
    }else{
      n = pIn->n;
      assert( flags & MEM_Blob );
      nPayload += n;
      nOut += sqlite4PutVarint64(aOut+nOut, 23+4*(sqlite4_int64)n);
    }
  }
  nHdr = nOut - 9;
  n = sqlite4PutVarint64(aOut, nHdr);
  for(i=n, j=9; j<nOut; j++) aOut[i++] = aOut[j];
  nOut = i;
  aOut = sqlite4DbReallocOrFree(db, aOut, nOut + nPayload);
Changes to www/data_encoding.wiki.
11
12
13
14
15
16
17


18
19
20
21
22
23
24
25

26
27
28
29



30
31
32
33

34








35
36
37
38
39



40
41
42
43
44
45
46
47
The data consists of a header area followed by a content area.  The data
begins with a single [./varint.wiki | varint] which is the size of the header area.  The
initial varint itself is not considered part of the header.  The header
is composed of one or two varints for each column in the table.  The varints
determine the datatype and size of the value for that column:

<blockquote><table border=1 cellpadding=5>


<tr><td>  0       <td>   NULL
<tr><td>  1       <td>   zero
<tr><td>  2       <td>   one
<tr><td>  3..10   <td>   (N-2)-byte signed integer
<tr><td>  11..21  <td>   (N-9)-byte number (two varints: min 2, max 12 bytes)
<tr><td>  22      <td>   non-REAL value in key, followed by varint key offset
<tr><td>  23      <td>   REAL value in key, followed by varint key offset
<tr><td>  24+3*K  <td>   K-byte string

<tr><td>  25+3*K  <td>   K-byte inline blob
<tr><td>  26+3*K  <td>   K-byte typed blob, followed by a single varint type code
</table></blockquote>




Strings can be either UTF8, UTF16le, or UTF16be.  If the first byte of the
payload is 0x00, 0x01, or 0x02 then that byte is ignored and the remaining
bytes are UTF8, UTF16le, or UTF16be respectively.  If the first byte is 0x03
or larger, then the entire string including the first byte is UTF8.










A "typed blob" is a sequence of bytes in an application-defined type.
The type is determined by a varint that immediately follows the initial
varint.  Hence, a typed blob uses two varints in the header whereas all
other types use a single varint.




The content of a number is two varints.  The first varint has a value
which is abs(e)*4 + (e<0)*2 + (m<0).  The second varint is abs(m).
The maximum e is 999, which gives a max varint value of 3999 (encoded as the
three bytes 0xf906af), so the first varint is at most 3 bytes.  Values of e greater
than 999 (used for Inf and NaN) are represented as a -0.  The second varint can be a full 9 bytes.
Example values:

<blockquote><table border=0>







>
>
|
|
|
|
|
|
|
|
>
|
|


>
>
>
|


|
>

>
>
>
>
>
>
>
>
|




>
>
>
|







11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
The data consists of a header area followed by a content area.  The data
begins with a single [./varint.wiki | varint] which is the size of the header area.  The
initial varint itself is not considered part of the header.  The header
is composed of one or two varints for each column in the table.  The varints
determine the datatype and size of the value for that column:

<blockquote><table border=1 cellpadding=5>
<tr><th> Encoding<br>Name <th> Header<br>Code (N)<th> Bytes of <br> Payload
    <th>Description
<tr><td>NULL<td>   0       <td> 0   <td> NULL
<tr><td>ZERO<td>   1       <td> 0   <td> Zero
<tr><td>ONE<td>    2       <td> 0   <td> One
<tr><td>INT<td>    3..10   <td> N-2 <td> Signed integer
<tr><td>NUM<td>    11..21  <td> N-9 <td> Floating-point number
<tr><td>STRING<td> 22+4*K  <td> K   <td> String
<tr><td>BLOB<td>   23+4*K  <td> K   <td> Inline blob
<tr><td>KEY<td>    24+4*K  <td> 0   <td> Content in key at offset K/2.
                                         Floating-point if LSB of K is 1.
<tr><td>TYPED<td>  25+4*K  <td> K   <td> Typed blob, followed by a
                                         single varint type code
</table></blockquote>

Header codes NULL, ZERO, and ONE are self describing and have no content in the 
payload area.

Strings (STRING) can be either UTF8, UTF16le, or UTF16be.  If the first byte of the
payload is 0x00, 0x01, or 0x02 then that byte is ignored and the remaining
bytes are UTF8, UTF16le, or UTF16be respectively.  If the first byte is 0x03
or larger, then the entire string including the first byte is UTF8.  An empty
string consists of the header code 22 and no payload.

Blobs (BLOB) are stored as a sequence of bytes with no encoding.  An empty blob
is header code 23.  A one-byte blob is header code 27.  And so forth.

The KEY header code indicates that the actual content of the column is in the
key portion of the key/value pair at an offset of K/2 bytes from the beginning of
the key.  Numeric values should be interpreted as floating point if K is odd and
as integers if K is even.

A "typed blob" (TYPED) is a sequence of bytes in an application-defined type.
The type is determined by a varint that immediately follows the initial
varint.  Hence, a typed blob uses two varints in the header whereas all
other types use a single varint.

The content of INT is the specified number of bytes for the signed integer.
The most significant bytes are first.

The content of a number (NUM) is two varints.  The first varint has a value
which is abs(e)*4 + (e<0)*2 + (m<0).  The second varint is abs(m).
The maximum e is 999, which gives a max varint value of 3999 (encoded as the
three bytes 0xf906af), so the first varint is at most 3 bytes.  Values of e greater
than 999 (used for Inf and NaN) are represented as a -0.  The second varint can be a full 9 bytes.
Example values:

<blockquote><table border=0>