Skip to content

Commit eb6a9ad

Browse files
committed
MDEV-26886: Estimation for filtered rows less precise with JSON histogram
- Make Histogram_json_hb::range_selectivity handle singleton buckets specially when computing the selectivity of the max endpoint bound (the min endpoint was already handled this way). - Also fix the comments for Histogram_json_hb::find_bucket.
1 parent 106c785 commit eb6a9ad

File tree

4 files changed

+84
-29
lines changed

4 files changed

+84
-29
lines changed

mysql-test/main/statistics_json.result

Lines changed: 25 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7658,3 +7658,28 @@ test.t1 analyze status OK
76587658
ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
76597659
INSERT INTO t1 (f) VALUES ('bar');
76607660
DROP TABLE t1;
7661+
#
7662+
# MDEV-26886: Estimation for filtered rows less precise with JSON histogram
7663+
#
7664+
create table t1 (a tinyint) as select if(seq%3,seq,0) as a from seq_1_to_100;
7665+
select count(*) from t1 where a <= 0;
7666+
count(*)
7667+
33
7668+
set histogram_type = JSON_HB, histogram_size=default;
7669+
analyze table t1 persistent for all;
7670+
Table Op Msg_type Msg_text
7671+
test.t1 analyze status Engine-independent statistics collected
7672+
test.t1 analyze status OK
7673+
analyze select * from t1 where a <= 0;
7674+
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
7675+
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 33.00 33.00 Using where
7676+
analyze select * from t1 where a < 0;
7677+
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
7678+
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 1.47 0.00 Using where
7679+
analyze select * from t1 where a > 0;
7680+
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
7681+
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 67.00 67.00 Using where
7682+
analyze select * from t1 where a >= 0;
7683+
id select_type table type possible_keys key key_len ref rows r_rows filtered r_filtered Extra
7684+
1 SIMPLE t1 ALL NULL NULL NULL NULL 100 100.00 100.00 100.00 Using where
7685+
drop table t1;

mysql-test/main/statistics_json.test

Lines changed: 14 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -340,3 +340,17 @@ ANALYZE TABLE t1 PERSISTENT FOR ALL;
340340
ALTER TABLE t1 MODIFY f TEXT, ORDER BY pk;
341341
INSERT INTO t1 (f) VALUES ('bar');
342342
DROP TABLE t1;
343+
344+
--echo #
345+
--echo # MDEV-26886: Estimation for filtered rows less precise with JSON histogram
346+
--echo #
347+
create table t1 (a tinyint) as select if(seq%3,seq,0) as a from seq_1_to_100;
348+
select count(*) from t1 where a <= 0;
349+
350+
set histogram_type = JSON_HB, histogram_size=default;
351+
analyze table t1 persistent for all;
352+
analyze select * from t1 where a <= 0;
353+
analyze select * from t1 where a < 0;
354+
analyze select * from t1 where a > 0;
355+
analyze select * from t1 where a >= 0;
356+
drop table t1;

sql/opt_histogram_json.cc

Lines changed: 44 additions & 28 deletions
Original file line numberDiff line numberDiff line change
@@ -743,9 +743,22 @@ double Histogram_json_hb::range_selectivity(Field *field, key_range *min_endp,
743743
idx--;
744744
}
745745
double left_fract= get_left_fract(idx);
746-
double sel= position_in_interval(field, max_key, max_key_len,
747-
buckets[idx].start_value,
748-
get_end_value(idx));
746+
747+
double sel;
748+
/* Special handling for singleton buckets */
749+
if (buckets[idx].ndv == 1 && equal)
750+
{
751+
if (inclusive_endp)
752+
sel= 1.0;
753+
else
754+
sel= 0.0;
755+
}
756+
else
757+
{
758+
sel= position_in_interval(field, max_key, max_key_len,
759+
buckets[idx].start_value,
760+
get_end_value(idx));
761+
}
749762
max= left_fract + sel * (buckets[idx].cum_fract - left_fract);
750763
}
751764
else
@@ -763,26 +776,18 @@ void Histogram_json_hb::serialize(Field *field)
763776

764777

765778
/*
766-
Find the rightmost histogram bucket such that "lookup_val $GT start_value".
767-
768-
$GT is either '>' or '>=' depending on equal_is_less parameter.
769-
770-
@param equal_is_less Controls what to do if a histogram bound is equal to the
771-
lookup_val.
772-
773-
@detail
774-
Possible cases:
775-
1. The regular case: the value falls into some bucket.
779+
@brief
780+
Find the rightmost histogram bucket such that "lookup_val >= start_value".
776781
777-
2. The value is less than the minimum of the first bucket
778-
3. The value is greater than the maximum of the last bucket
779-
In these cases we "clip" to the first/last bucket.
782+
@param field Field object (used to do value comparisons)
783+
@param lookup_val The lookup value in KeyTupleFormat.
784+
@param equal OUT TRUE<=> the found bucket has left_bound=lookup_val
780785
781-
4. The value hits the bucket boundary. Then, we need to know whether the
782-
point of interest is to the left of the constant, or to the right of it.
786+
@return
787+
The bucket index
783788
*/
784789

785-
int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
790+
int Histogram_json_hb::find_bucket(const Field *field, const uchar *lookup_val,
786791
bool *equal)
787792
{
788793
int res;
@@ -797,7 +802,8 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
797802
if (!res)
798803
{
799804
*equal= true;
800-
return middle;
805+
low= middle;
806+
goto end;
801807
}
802808
else if (res < 0)
803809
low= middle;
@@ -806,25 +812,25 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
806812
}
807813

808814
/*
809-
If low and high were assigned a value in the above loop, then they are not
810-
equal to the lookup value:
815+
If low and high were assigned a value in the above loop and we got here,
816+
then the following holds:
811817
812-
bucket[low] < lookup_val < bucket[high]
818+
bucket[low].start_value < lookup_val < bucket[high].start_value
813819
814-
But there are two special cases: low=0 and high=last_bucket. Handle them
815-
below.
820+
Besides that, there are two special cases: low=0 and high=last_bucket.
821+
Handle them below.
816822
*/
817823
if (low == 0)
818824
{
819825
res= field->key_cmp((uchar*)buckets[0].start_value.data(), lookup_val);
820826
if (!res)
821827
*equal= true;
822-
else if (res < 0)
828+
else if (res < 0) // buckets[0] < lookup_val
823829
{
824830
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
825831
if (!res)
826832
*equal= true;
827-
if (res >= 0)
833+
if (res <= 0) // buckets[high] <= lookup_val
828834
low= high;
829835
}
830836
}
@@ -833,9 +839,19 @@ int Histogram_json_hb::find_bucket(Field *field, const uchar *lookup_val,
833839
res= field->key_cmp((uchar*)buckets[high].start_value.data(), lookup_val);
834840
if (!res)
835841
*equal= true;
836-
if (res >= 0)
842+
if (res <= 0)
837843
low= high;
838844
}
839845

846+
end:
847+
// Verification: *equal==TRUE <=> lookup value is equal to the found bucket's start_value.
848+
DBUG_ASSERT(*equal == !(field->key_cmp((uchar*)buckets[low].start_value.data(),
849+
lookup_val)));
850+
// buckets[low].start_value <= lookup_val, except possibly when low is the first bucket.
851+
DBUG_ASSERT(low == 0 ||
852+
field->key_cmp((uchar*)buckets[low].start_value.data(), lookup_val)<= 0);
853+
// buckets[low+1].start_value > lookup_val, except when low is the last bucket.
854+
DBUG_ASSERT(low == (int)buckets.size()-1 ||
855+
field->key_cmp((uchar*)buckets[low+1].start_value.data(), lookup_val)> 0);
840856
return low;
841857
}

sql/opt_histogram_json.h

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -124,6 +124,6 @@ class Histogram_json_hb : public Histogram_base
124124
private:
125125
double get_left_fract(int idx);
126126
std::string& get_end_value(int idx);
127-
int find_bucket(Field *field, const uchar *lookup_val, bool *equal);
127+
int find_bucket(const Field *field, const uchar *lookup_val, bool *equal);
128128
};
129129

0 commit comments

Comments
 (0)