diff mbox series

[v1,4/8] x86: Optimize memrchr-sse2.S

Message ID 20220603044229.2180216-4-goldstein.w.n@gmail.com
State New
Headers show
Series [v1,1/8] x86: Create header for VEC classes in x86 strings library | expand

Commit Message

Noah Goldstein June 3, 2022, 4:42 a.m. UTC
The new code:
    1. prioritizes smaller lengths more.
    2. optimizes target placement more carefully.
    3. reuses logic more.
    4. fixes up various inefficiencies in the logic.

The total code size saving is: 394 bytes
Geometric Mean of all benchmarks New / Old: 0.874

Regressions:
    1. The page cross case is now colder, especially re-entry from the
       page cross case if a match is not found in the first VEC
       (roughly 50%). My general opinion of this patch is that this is
       acceptable given the "coldness" of this case (less than 4%) and
       the general performance improvement in the other, far more
       common cases.

    2. There are some regressions of 5-15% for medium/large user-arg
       lengths that have a match in the first VEC. This is because the
       logic was rewritten to optimize finds in the first VEC if the
       user-arg length is shorter (where we see roughly 20-50%
       performance improvements). This is not always a regression,
       however. My intuition is that some frontend quirk partially
       explains the data, although I haven't been able to find the
       root cause.

Full xcheck passes on x86_64.
---
 sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++--------------------
 1 file changed, 292 insertions(+), 321 deletions(-)

Comments

Noah Goldstein June 3, 2022, 4:47 a.m. UTC | #1
On Thu, Jun 2, 2022 at 11:42 PM Noah Goldstein <goldstein.w.n@gmail.com> wrote:
>
> The new code:
>     1. prioritizes smaller lengths more.
>     2. optimizes target placement more carefully.
>     3. reuses logic more.
>     4. fixes up various inefficiencies in the logic.
>
> The total code size saving is: 394 bytes
> Geometric Mean of all benchmarks New / Old: 0.874
>
> Regressions:
>     1. The page cross case is now colder, especially re-entry from the
>        page cross case if a match is not found in the first VEC
>        (roughly 50%). My general opinion with this patch is this is
>        acceptable given the "coldness" of this case (less than 4%) and
>        generally performance improvement in the other far more common
>        cases.
>
>     2. There are some regressions 5-15% for medium/large user-arg
>        lengths that have a match in the first VEC. This is because the
>        logic was rewritten to optimize finds in the first VEC if the
>        user-arg length is shorter (where we see roughly 20-50%
>        performance improvements). It is not always the case this is a
>        regression. My intuition is some frontend quirk is partially
>        explaining the data although I haven't been able to find the
>        root cause.
>
> Full xcheck passes on x86_64.
> ---

I am least confident about the numbers in this patch.

Geometric mean of N = 30 runs.
Benchmarked on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

Aggregate Geometric Mean of New / Old: 0.8743388468654057

Results For: memrchr
 len, align,  pos, seek_char, invert_pos,  New / Old
2048,     0,   32,        23,          0,      0.993
 256,     1,   64,        23,          0,      0.903
2048,     0,   32,         0,          0,       0.89
 256,     1,   64,         0,          0,      0.904
 256,  4081,   64,         0,          0,      0.907
 256,     0,    1,        23,          0,       0.95
 256,     0,    1,         0,          0,       0.95
 256,     1,    1,        23,          0,      0.885
 256,     1,    1,         0,          0,      0.883
2048,     0,   64,        23,          0,        0.8
 256,     2,   64,        23,          0,      0.905
2048,     0,   64,         0,          0,      0.795
 256,     2,   64,         0,          0,      0.905
 256,     0,    2,        23,          0,      0.949
 256,     0,    2,         0,          0,      0.949
 256,     2,    2,        23,          0,      0.885
 256,     2,    2,         0,          0,      0.886
2048,     0,  128,        23,          0,      0.781
 256,     3,   64,        23,          0,      0.904
2048,     0,  128,         0,          0,      0.804
 256,     3,   64,         0,          0,      0.904
 256,     0,    3,        23,          0,      0.948
 256,     0,    3,         0,          0,      0.948
 256,     3,    3,        23,          0,      0.886
 256,     3,    3,         0,          0,      0.881
2048,     0,  256,        23,          0,      0.715
 256,     4,   64,        23,          0,      0.896
2048,     0,  256,         0,          0,      0.747
 256,     4,   64,         0,          0,      0.897
 256,     0,    4,        23,          0,      0.948
 256,     0,    4,         0,          0,       0.95
 256,     4,    4,        23,          0,      0.884
 256,     4,    4,         0,          0,      0.885
2048,     0,  512,        23,          0,       0.66
 256,     5,   64,        23,          0,      0.905
2048,     0,  512,         0,          0,      0.674
 256,     5,   64,         0,          0,      0.905
 256,     0,    5,        23,          0,      0.951
 256,     0,    5,         0,          0,       0.95
 256,     5,    5,        23,          0,      0.885
 256,     5,    5,         0,          0,      0.883
2048,     0, 1024,        23,          0,      0.952
 256,     6,   64,        23,          0,      0.905
2048,     0, 1024,         0,          0,      0.952
 256,     6,   64,         0,          0,      0.904
 256,     0,    6,        23,          0,       0.95
 256,     0,    6,         0,          0,       0.95
 256,     6,    6,        23,          0,      0.884
 256,     6,    6,         0,          0,      0.884
2048,     0, 2048,        23,          0,      0.843
 256,     7,   64,        23,          0,      0.904
2048,     0, 2048,         0,          0,      0.839
 256,     7,   64,         0,          0,      0.906
 256,     0,    7,        23,          0,      0.951
 256,     0,    7,         0,          0,      0.951
 256,     7,    7,        23,          0,      0.887
 256,     7,    7,         0,          0,      0.885
 192,     1,   32,        23,          0,      0.867
 192,     1,   32,         0,          0,      0.866
 256,     1,   32,        23,          0,      0.888
 256,     1,   32,         0,          0,      0.888
 512,     1,   32,        23,          0,      1.103
 512,     1,   32,         0,          0,      1.102
 256,  4081,   32,        23,          0,      0.924
 192,     2,   64,        23,          0,      1.081
 192,     2,   64,         0,          0,      1.081
 512,     2,   64,        23,          0,      1.131
 512,     2,   64,         0,          0,      1.129
 256,  4081,   64,        23,          0,      0.905
 192,     3,   96,        23,          0,      1.174
 192,     3,   96,         0,          0,      1.174
 256,     3,   96,        23,          0,       0.73
 256,     3,   96,         0,          0,       0.73
 512,     3,   96,        23,          0,      0.755
 512,     3,   96,         0,          0,      0.757
 256,  4081,   96,        23,          0,      0.835
 192,     4,  128,        23,          0,      0.898
 192,     4,  128,         0,          0,      0.895
 256,     4,  128,        23,          0,      1.081
 256,     4,  128,         0,          0,      1.082
 512,     4,  128,        23,          0,      1.088
 512,     4,  128,         0,          0,      1.087
 256,  4081,  128,        23,          0,      1.252
 192,     5,  160,        23,          0,      0.894
 192,     5,  160,         0,          0,      0.894
 256,     5,  160,        23,          0,      1.174
 256,     5,  160,         0,          0,      1.174
 512,     5,  160,        23,          0,      1.093
 512,     5,  160,         0,          0,      1.097
 256,  4081,  160,        23,          0,      1.255
 192,     6,  192,        23,          0,      0.869
 192,     6,  192,         0,          0,      0.869
 256,     6,  192,        23,          0,      0.903
 256,     6,  192,         0,          0,      0.899
 512,     6,  192,        23,          0,      0.999
 512,     6,  192,         0,          0,        1.0
 256,  4081,  192,        23,          0,       0.91
 192,     7,  224,        23,          0,      0.869
 192,     7,  224,         0,          0,      0.868
 256,     7,  224,        23,          0,      0.893
 256,     7,  224,         0,          0,      0.893
 512,     7,  224,        23,          0,      0.718
 512,     7,  224,         0,          0,      0.718
 256,  4081,  224,        23,          0,      0.903
   2,     0,    1,        23,          0,      1.026
   2,     0,    1,         0,          0,      1.029
   2,     1,    1,        23,          0,      0.874
   2,     1,    1,         0,          0,      0.875
   0,     0,    1,        23,          0,      0.583
   0,     0,    1,         0,          0,      0.583
   0,     1,    1,        23,          0,      0.539
   0,     1,    1,         0,          0,      0.538
   2,  2048,    1,        23,          0,      0.751
   2,  2048,    1,         0,          0,      0.749
   2,  2049,    1,        23,          0,      0.638
   2,  2049,    1,         0,          0,      0.638
   0,  2048,    1,        23,          0,        0.5
   0,  2048,    1,         0,          0,        0.5
   0,  2049,    1,        23,          0,      0.462
   0,  2049,    1,         0,          0,      0.462
   0,  4081,    1,        23,          0,      0.462
   0,  4081,    1,         0,          0,      0.462
   2,  4081,    1,        23,          0,       0.61
   2,  4081,    1,         0,          0,      0.609
   2,     0,    2,         0,          0,      0.889
   3,     0,    2,        23,          0,       1.05
   3,     0,    2,         0,          0,      1.034
   3,     2,    2,        23,          0,        0.9
   3,     2,    2,         0,          0,      0.887
   1,     0,    2,        23,          0,      0.942
   1,     0,    2,         0,          0,      0.941
   1,     2,    2,        23,          0,      1.043
   1,     2,    2,         0,          0,       1.11
   3,  2048,    2,        23,          0,       0.75
   3,  2048,    2,         0,          0,       0.75
   3,  2050,    2,        23,          0,      0.638
   3,  2050,    2,         0,          0,      0.639
   1,  2048,    2,        23,          0,      0.666
   1,  2048,    2,         0,          0,      0.668
   1,  2050,    2,        23,          0,      0.734
   1,  2050,    2,         0,          0,      0.727
   1,  4081,    2,        23,          0,      0.725
   1,  4081,    2,         0,          0,      0.726
   3,  4081,    2,        23,          0,      0.614
   3,  4081,    2,         0,          0,      0.619
   3,     0,    1,        23,          0,      1.043
   4,     0,    3,        23,          0,       1.04
   4,     0,    3,         0,          0,      1.043
   4,     3,    3,        23,          0,      0.886
   4,     3,    3,         0,          0,      0.901
   2,     0,    3,        23,          0,      0.923
   2,     0,    3,         0,          0,      0.933
   2,     3,    3,        23,          0,       1.01
   2,     3,    3,         0,          0,      1.083
   4,  2048,    3,        23,          0,      0.751
   4,  2048,    3,         0,          0,       0.75
   4,  2051,    3,        23,          0,      0.638
   4,  2051,    3,         0,          0,      0.641
   2,  2048,    3,        23,          0,       0.67
   2,  2048,    3,         0,          0,       0.67
   2,  2051,    3,        23,          0,      0.728
   2,  2051,    3,         0,          0,       0.73
   2,  4081,    3,        23,          0,      0.727
   2,  4081,    3,         0,          0,      0.726
   4,  4081,    3,        23,          0,      0.613
   4,  4081,    3,         0,          0,       0.63
   4,     0,    1,        23,          0,      1.073
   4,     0,    2,         0,          0,      1.055
   5,     0,    4,        23,          0,      1.055
   5,     0,    4,         0,          0,      1.066
   5,     4,    4,        23,          0,      0.893
   5,     4,    4,         0,          0,      0.892
   3,     0,    4,        23,          0,      0.911
   3,     0,    4,         0,          0,      0.913
   3,     4,    4,        23,          0,      0.988
   3,     4,    4,         0,          0,      1.055
   5,  2048,    4,        23,          0,      0.751
   5,  2048,    4,         0,          0,      0.752
   5,  2052,    4,        23,          0,       0.64
   5,  2052,    4,         0,          0,      0.639
   3,  2048,    4,        23,          0,      0.668
   3,  2048,    4,         0,          0,      0.669
   3,  2052,    4,        23,          0,       0.73
   3,  2052,    4,         0,          0,      0.731
   3,  4081,    4,        23,          0,      0.726
   3,  4081,    4,         0,          0,       0.73
   5,  4081,    4,        23,          0,       0.62
   5,  4081,    4,         0,          0,      0.611
   5,     0,    1,        23,          0,      1.044
   5,     0,    2,         0,          0,      1.048
   6,     0,    5,        23,          0,      1.062
   6,     0,    5,         0,          0,      1.064
   6,     5,    5,        23,          0,      0.898
   6,     5,    5,         0,          0,      0.896
   4,     0,    5,        23,          0,      0.894
   4,     0,    5,         0,          0,      0.894
   4,     5,    5,        23,          0,      0.974
   4,     5,    5,         0,          0,      1.042
   6,  2048,    5,        23,          0,      0.752
   6,  2048,    5,         0,          0,      0.751
   6,  2053,    5,        23,          0,      0.639
   6,  2053,    5,         0,          0,      0.638
   4,  2048,    5,        23,          0,      0.667
   4,  2048,    5,         0,          0,      0.668
   4,  2053,    5,        23,          0,       0.73
   4,  2053,    5,         0,          0,      0.729
   4,  4081,    5,        23,          0,      0.726
   4,  4081,    5,         0,          0,      0.727
   6,  4081,    5,        23,          0,      0.626
   6,  4081,    5,         0,          0,      0.619
   6,     0,    1,        23,          0,      1.045
   6,     0,    2,         0,          0,      1.049
   7,     0,    6,        23,          0,      1.032
   7,     0,    6,         0,          0,      1.038
   7,     6,    6,        23,          0,      0.889
   7,     6,    6,         0,          0,      0.894
   5,     0,    6,        23,          0,       0.89
   5,     0,    6,         0,          0,      0.891
   5,     6,    6,        23,          0,      0.971
   5,     6,    6,         0,          0,      0.997
   7,  2048,    6,        23,          0,      0.751
   7,  2048,    6,         0,          0,      0.747
   7,  2054,    6,        23,          0,      0.639
   7,  2054,    6,         0,          0,       0.64
   5,  2048,    6,        23,          0,      0.667
   5,  2048,    6,         0,          0,      0.669
   5,  2054,    6,        23,          0,      0.732
   5,  2054,    6,         0,          0,      0.728
   5,  4081,    6,        23,          0,      0.729
   5,  4081,    6,         0,          0,      0.727
   7,  4081,    6,        23,          0,      0.631
   7,  4081,    6,         0,          0,      0.619
   7,     0,    1,        23,          0,      1.042
   7,     0,    2,         0,          0,      1.039
   8,     0,    7,        23,          0,      1.034
   8,     0,    7,         0,          0,       1.04
   8,     7,    7,        23,          0,      0.876
   8,     7,    7,         0,          0,      0.883
   6,     0,    7,        23,          0,      0.891
   6,     0,    7,         0,          0,      0.895
   6,     7,    7,        23,          0,      0.986
   6,     7,    7,         0,          0,      0.996
   8,  2048,    7,        23,          0,      0.754
   8,  2048,    7,         0,          0,      0.754
   8,  2055,    7,        23,          0,      0.638
   8,  2055,    7,         0,          0,      0.638
   6,  2048,    7,        23,          0,      0.667
   6,  2048,    7,         0,          0,       0.67
   6,  2055,    7,        23,          0,       0.73
   6,  2055,    7,         0,          0,      0.729
   6,  4081,    7,        23,          0,      0.726
   6,  4081,    7,         0,          0,      0.727
   8,  4081,    7,        23,          0,       0.61
   8,  4081,    7,         0,          0,      0.616
   8,     0,    1,        23,          0,      1.031
   8,     0,    2,         0,          0,      1.032
   9,     0,    8,        23,          0,      1.044
   9,     0,    8,         0,          0,      1.037
   9,     8,    8,        23,          0,      0.652
   9,     8,    8,         0,          0,      0.643
   7,     0,    8,        23,          0,      0.897
   7,     0,    8,         0,          0,      0.889
   7,     8,    8,        23,          0,      0.969
   7,     8,    8,         0,          0,      1.015
   9,  2048,    8,        23,          0,      0.753
   9,  2048,    8,         0,          0,       0.75
   9,  2056,    8,        23,          0,      0.645
   9,  2056,    8,         0,          0,      0.655
   7,  2048,    8,        23,          0,      0.667
   7,  2048,    8,         0,          0,      0.671
   7,  2056,    8,        23,          0,      0.731
   7,  2056,    8,         0,          0,      0.731
   7,  4081,    8,        23,          0,      0.723
   7,  4081,    8,         0,          0,      0.724
   9,  4081,    8,        23,          0,      0.653
   9,  4081,    8,         0,          0,      0.638
   9,     0,    1,        23,          0,      1.037
   9,     0,    2,         0,          0,      1.032
  10,     0,    9,        23,          0,      1.033
  10,     0,    9,         0,          0,       1.03
  10,     9,    9,        23,          0,       0.66
  10,     9,    9,         0,          0,      0.657
   8,     0,    9,        23,          0,      0.888
   8,     0,    9,         0,          0,      0.891
   8,     9,    9,        23,          0,      0.631
   8,     9,    9,         0,          0,      0.632
  10,  2048,    9,        23,          0,      0.767
  10,  2048,    9,         0,          0,      0.759
  10,  2057,    9,        23,          0,      0.666
  10,  2057,    9,         0,          0,      0.647
   8,  2048,    9,        23,          0,      0.669
   8,  2048,    9,         0,          0,      0.668
   8,  2057,    9,        23,          0,      0.629
   8,  2057,    9,         0,          0,      0.641
   8,  4081,    9,        23,          0,      0.727
   8,  4081,    9,         0,          0,      0.764
  10,  4081,    9,        23,          0,      0.642
  10,  4081,    9,         0,          0,      0.653
  10,     0,    1,        23,          0,      1.031
  10,     0,    2,         0,          0,      1.038
  11,     0,   10,        23,          0,      1.032
  11,     0,   10,         0,          0,      1.029
  11,    10,   10,        23,          0,      0.652
  11,    10,   10,         0,          0,      0.656
   9,     0,   10,        23,          0,      0.893
   9,     0,   10,         0,          0,      0.894
   9,    10,   10,        23,          0,      0.629
   9,    10,   10,         0,          0,      0.707
  11,  2048,   10,        23,          0,      0.753
  11,  2048,   10,         0,          0,      0.749
  11,  2058,   10,        23,          0,      0.662
  11,  2058,   10,         0,          0,      0.661
   9,  2048,   10,        23,          0,      0.673
   9,  2048,   10,         0,          0,      0.666
   9,  2058,   10,        23,          0,      0.629
   9,  2058,   10,         0,          0,      0.663
   9,  4081,   10,        23,          0,      0.727
   9,  4081,   10,         0,          0,      0.779
  11,  4081,   10,        23,          0,      0.624
  11,  4081,   10,         0,          0,      0.619
  11,     0,    1,        23,          0,       1.03
  11,     0,    2,         0,          0,       1.03
  12,     0,   11,        23,          0,      1.039
  12,     0,   11,         0,          0,       1.03
  12,    11,   11,        23,          0,      0.653
  12,    11,   11,         0,          0,      0.652
  10,     0,   11,        23,          0,      0.896
  10,     0,   11,         0,          0,      0.889
  10,    11,   11,        23,          0,      0.628
  10,    11,   11,         0,          0,      0.696
  12,  2048,   11,        23,          0,      0.752
  12,  2048,   11,         0,          0,      0.754
  12,  2059,   11,        23,          0,      0.657
  12,  2059,   11,         0,          0,      0.652
  10,  2048,   11,        23,          0,       0.67
  10,  2048,   11,         0,          0,      0.668
  10,  2059,   11,        23,          0,      0.627
  10,  2059,   11,         0,          0,      0.677
  10,  4081,   11,        23,          0,      0.726
  10,  4081,   11,         0,          0,      0.771
  12,  4081,   11,        23,          0,      0.648
  12,  4081,   11,         0,          0,      0.624
  12,     0,    1,        23,          0,      1.047
  12,     0,    2,         0,          0,      1.042
  13,     0,   12,        23,          0,      1.043
  13,     0,   12,         0,          0,       1.04
  13,    12,   12,        23,          0,       0.66
  13,    12,   12,         0,          0,      0.647
  11,     0,   12,        23,          0,      0.891
  11,     0,   12,         0,          0,      0.895
  11,    12,   12,        23,          0,      0.629
  11,    12,   12,         0,          0,      0.655
  13,  2048,   12,        23,          0,      0.749
  13,  2048,   12,         0,          0,      0.748
  13,  2060,   12,        23,          0,      0.647
  13,  2060,   12,         0,          0,      0.636
  11,  2048,   12,        23,          0,      0.669
  11,  2048,   12,         0,          0,      0.668
  11,  2060,   12,        23,          0,      0.627
  11,  2060,   12,         0,          0,      0.664
  11,  4081,   12,        23,          0,      0.725
  11,  4081,   12,         0,          0,      0.766
  13,  4081,   12,        23,          0,      0.674
  13,  4081,   12,         0,          0,      0.633
  13,     0,    1,        23,          0,      1.036
  13,     0,    2,         0,          0,      1.029
  14,     0,   13,        23,          0,      1.029
  14,     0,   13,         0,          0,      1.032
  14,    13,   13,        23,          0,      0.646
  14,    13,   13,         0,          0,      0.655
  12,     0,   13,        23,          0,      0.889
  12,     0,   13,         0,          0,       0.89
  12,    13,   13,        23,          0,      0.628
  12,    13,   13,         0,          0,      0.684
  14,  2048,   13,        23,          0,      0.748
  14,  2048,   13,         0,          0,      0.749
  14,  2061,   13,        23,          0,      0.644
  14,  2061,   13,         0,          0,      0.651
  12,  2048,   13,        23,          0,       0.67
  12,  2048,   13,         0,          0,      0.667
  12,  2061,   13,        23,          0,      0.627
  12,  2061,   13,         0,          0,      0.655
  12,  4081,   13,        23,          0,      0.725
  12,  4081,   13,         0,          0,      0.758
  14,  4081,   13,        23,          0,      0.645
  14,  4081,   13,         0,          0,      0.638
  14,     0,    1,        23,          0,      1.046
  14,     0,    2,         0,          0,      1.029
  15,     0,   14,        23,          0,      1.028
  15,     0,   14,         0,          0,      1.029
  15,    14,   14,        23,          0,       0.65
  15,    14,   14,         0,          0,      0.671
  13,     0,   14,        23,          0,      0.891
  13,     0,   14,         0,          0,       0.89
  13,    14,   14,        23,          0,      0.637
  13,    14,   14,         0,          0,      0.628
  15,  2048,   14,        23,          0,       0.75
  15,  2048,   14,         0,          0,      0.751
  15,  2062,   14,        23,          0,      0.647
  15,  2062,   14,         0,          0,      0.655
  13,  2048,   14,        23,          0,      0.667
  13,  2048,   14,         0,          0,      0.667
  13,  2062,   14,        23,          0,      0.658
  13,  2062,   14,         0,          0,      0.655
  13,  4081,   14,        23,          0,      0.726
  13,  4081,   14,         0,          0,      0.778
  15,  4081,   14,        23,          0,      0.872
  15,  4081,   14,         0,          0,      0.872
  15,     0,    1,        23,          0,      1.052
  15,     0,    2,         0,          0,      1.028
  16,     0,   15,        23,          0,      0.724
  16,     0,   15,         0,          0,      0.724
  16,    15,   15,        23,          0,       0.65
  16,    15,   15,         0,          0,       0.65
  14,     0,   15,        23,          0,      0.889
  14,     0,   15,         0,          0,      0.889
  14,    15,   15,        23,          0,      0.626
  14,    15,   15,         0,          0,      0.665
  16,  2048,   15,        23,          0,      0.735
  16,  2048,   15,         0,          0,      0.717
  16,  2063,   15,        23,          0,      0.648
  16,  2063,   15,         0,          0,      0.651
  14,  2048,   15,        23,          0,      0.667
  14,  2048,   15,         0,          0,      0.667
  14,  2063,   15,        23,          0,      0.627
  14,  2063,   15,         0,          0,      0.694
  14,  4081,   15,        23,          0,      0.725
  14,  4081,   15,         0,          0,      0.801
  16,  4081,   15,        23,          0,      0.999
  16,  4081,   15,         0,          0,      0.999
  16,     0,    1,        23,          0,      0.751
  16,     0,    2,         0,          0,      0.731
  17,     0,   16,        23,          0,      1.167
  17,     0,   16,         0,          0,      1.165
  17,    16,   16,        23,          0,      1.167
  17,    16,   16,         0,          0,      1.167
  15,     0,   16,        23,          0,      0.889
  15,     0,   16,         0,          0,      0.889
  15,    16,   16,        23,          0,      0.666
  15,    16,   16,         0,          0,      0.712
  17,  2048,   16,        23,          0,      1.167
  17,  2048,   16,         0,          0,      1.167
  17,  2064,   16,        23,          0,      1.167
  17,  2064,   16,         0,          0,      1.167
  15,  2048,   16,        23,          0,      0.667
  15,  2048,   16,         0,          0,      0.667
  15,  2064,   16,        23,          0,      0.667
  15,  2064,   16,         0,          0,      0.696
  15,  4081,   16,        23,          0,      0.956
  15,  4081,   16,         0,          0,      1.098
  17,  4081,   16,        23,          0,        1.5
  17,  4081,   16,         0,          0,        1.5
  17,     0,    1,        23,          0,      1.167
  17,     0,    2,         0,          0,      1.167
  18,     0,   17,        23,          0,      1.167
  18,     0,   17,         0,          0,      1.167
  18,    17,   17,        23,          0,      1.167
  18,    17,   17,         0,          0,      1.167
  16,     0,   17,        23,          0,      0.667
  16,     0,   17,         0,          0,      0.667
  16,    17,   17,        23,          0,      0.627
  16,    17,   17,         0,          0,      0.627
  18,  2048,   17,        23,          0,      1.167
  18,  2048,   17,         0,          0,      1.167
  18,  2065,   17,        23,          0,      1.167
  18,  2065,   17,         0,          0,      1.167
  16,  2048,   17,        23,          0,      0.667
  16,  2048,   17,         0,          0,      0.667
  16,  2065,   17,        23,          0,      0.627
  16,  2065,   17,         0,          0,      0.627
  16,  4081,   17,        23,          0,      1.046
  16,  4081,   17,         0,          0,      1.095
  18,  4081,   17,        23,          0,        1.5
  18,  4081,   17,         0,          0,        1.5
  18,     0,    1,        23,          0,      0.852
  18,     0,    2,         0,          0,      1.167
  19,     0,   18,        23,          0,      1.167
  19,     0,   18,         0,          0,      1.167
  19,    18,   18,        23,          0,      1.167
  19,    18,   18,         0,          0,      1.167
  17,     0,   18,        23,          0,      0.889
  17,     0,   18,         0,          0,      0.889
  17,    18,   18,        23,          0,      0.889
  17,    18,   18,         0,          0,        0.8
  19,  2048,   18,        23,          0,      1.167
  19,  2048,   18,         0,          0,      1.167
  19,  2066,   18,        23,          0,      1.167
  19,  2066,   18,         0,          0,      1.167
  17,  2048,   18,        23,          0,      0.889
  17,  2048,   18,         0,          0,      0.889
  17,  2066,   18,        23,          0,      0.889
  17,  2066,   18,         0,          0,        0.8
  17,  4081,   18,        23,          0,       1.11
  17,  4081,   18,         0,          0,      1.047
  19,  4081,   18,        23,          0,        1.5
  19,  4081,   18,         0,          0,        1.5
  19,     0,    1,        23,          0,      0.897
  19,     0,    2,         0,          0,      0.878
  20,     0,   19,        23,          0,      1.167
  20,     0,   19,         0,          0,      1.167
  20,    19,   19,        23,          0,      1.167
  20,    19,   19,         0,          0,      1.167
  18,     0,   19,        23,          0,      0.889
  18,     0,   19,         0,          0,      0.889
  18,    19,   19,        23,          0,      0.889
  18,    19,   19,         0,          0,        0.8
  20,  2048,   19,        23,          0,      1.167
  20,  2048,   19,         0,          0,      1.167
  20,  2067,   19,        23,          0,      1.167
  20,  2067,   19,         0,          0,      1.167
  18,  2048,   19,        23,          0,      0.889
  18,  2048,   19,         0,          0,      0.889
  18,  2067,   19,        23,          0,      0.889
  18,  2067,   19,         0,          0,        0.8
  18,  4081,   19,        23,          0,       1.11
  18,  4081,   19,         0,          0,      1.047
  20,  4081,   19,        23,          0,        1.5
  20,  4081,   19,         0,          0,        1.5
  20,     0,    1,        23,          0,      0.906
  20,     0,    2,         0,          0,      0.899
  21,     0,   20,        23,          0,      1.167
  21,     0,   20,         0,          0,      1.167
  21,    20,   20,        23,          0,      1.167
  21,    20,   20,         0,          0,      1.167
  19,     0,   20,        23,          0,      0.889
  19,     0,   20,         0,          0,      0.889
  19,    20,   20,        23,          0,      0.889
  19,    20,   20,         0,          0,        0.8
  21,  2048,   20,        23,          0,      1.167
  21,  2048,   20,         0,          0,      1.167
  21,  2068,   20,        23,          0,      1.167
  21,  2068,   20,         0,          0,      1.167
  19,  2048,   20,        23,          0,      0.889
  19,  2048,   20,         0,          0,      0.889
  19,  2068,   20,        23,          0,      0.889
  19,  2068,   20,         0,          0,        0.8
  19,  4081,   20,        23,          0,       1.11
  19,  4081,   20,         0,          0,      1.047
  21,  4081,   20,        23,          0,        1.5
  21,  4081,   20,         0,          0,        1.5
  21,     0,    1,        23,          0,      0.902
  21,     0,    2,         0,          0,      0.891
  22,     0,   21,        23,          0,      1.167
  22,     0,   21,         0,          0,      1.167
  22,    21,   21,        23,          0,      1.167
  22,    21,   21,         0,          0,      1.167
  20,     0,   21,        23,          0,      0.889
  20,     0,   21,         0,          0,      0.889
  20,    21,   21,        23,          0,      0.889
  20,    21,   21,         0,          0,        0.8
  22,  2048,   21,        23,          0,      1.167
  22,  2048,   21,         0,          0,      1.167
  22,  2069,   21,        23,          0,      1.167
  22,  2069,   21,         0,          0,      1.167
  20,  2048,   21,        23,          0,      0.889
  20,  2048,   21,         0,          0,      0.889
  20,  2069,   21,        23,          0,      0.889
  20,  2069,   21,         0,          0,        0.8
  20,  4081,   21,        23,          0,       1.11
  20,  4081,   21,         0,          0,       1.06
  22,  4081,   21,        23,          0,        1.5
  22,  4081,   21,         0,          0,        1.5
  22,     0,    1,        23,          0,      0.915
  22,     0,    2,         0,          0,      0.906
  23,     0,   22,        23,          0,      1.167
  23,     0,   22,         0,          0,      1.167
  23,    22,   22,        23,          0,      1.167
  23,    22,   22,         0,          0,      1.167
  21,     0,   22,        23,          0,      0.889
  21,     0,   22,         0,          0,      0.889
  21,    22,   22,        23,          0,      0.889
  21,    22,   22,         0,          0,        0.8
  23,  2048,   22,        23,          0,      1.167
  23,  2048,   22,         0,          0,      1.167
  23,  2070,   22,        23,          0,      1.167
  23,  2070,   22,         0,          0,      1.167
  21,  2048,   22,        23,          0,      0.889
  21,  2048,   22,         0,          0,      0.889
  21,  2070,   22,        23,          0,      0.889
  21,  2070,   22,         0,          0,        0.8
  21,  4081,   22,        23,          0,       1.11
  21,  4081,   22,         0,          0,      1.059
  23,  4081,   22,        23,          0,        1.5
  23,  4081,   22,         0,          0,        1.5
  23,     0,    1,        23,          0,      0.914
  23,     0,    2,         0,          0,      0.907
  24,     0,   23,        23,          0,      1.167
  24,     0,   23,         0,          0,      1.167
  24,    23,   23,        23,          0,      1.167
  24,    23,   23,         0,          0,      1.167
  22,     0,   23,        23,          0,      0.889
  22,     0,   23,         0,          0,      0.889
  22,    23,   23,        23,          0,      0.889
  22,    23,   23,         0,          0,        0.8
  24,  2048,   23,        23,          0,      1.167
  24,  2048,   23,         0,          0,      1.167
  24,  2071,   23,        23,          0,      1.167
  24,  2071,   23,         0,          0,      1.167
  22,  2048,   23,        23,          0,      0.889
  22,  2048,   23,         0,          0,      0.889
  22,  2071,   23,        23,          0,      0.889
  22,  2071,   23,         0,          0,        0.8
  22,  4081,   23,        23,          0,       1.11
  22,  4081,   23,         0,          0,      1.049
  24,  4081,   23,        23,          0,        1.5
  24,  4081,   23,         0,          0,        1.5
  24,     0,    1,        23,          0,      0.915
  24,     0,    2,         0,          0,      0.915
  25,     0,   24,        23,          0,      1.167
  25,     0,   24,         0,          0,      1.167
  25,    24,   24,        23,          0,      1.167
  25,    24,   24,         0,          0,      1.167
  23,     0,   24,        23,          0,      0.889
  23,     0,   24,         0,          0,      0.889
  23,    24,   24,        23,          0,      0.889
  23,    24,   24,         0,          0,        0.8
  25,  2048,   24,        23,          0,      1.167
  25,  2048,   24,         0,          0,      1.167
  25,  2072,   24,        23,          0,      1.167
  25,  2072,   24,         0,          0,      1.167
  23,  2048,   24,        23,          0,      0.889
  23,  2048,   24,         0,          0,      0.889
  23,  2072,   24,        23,          0,      0.889
  23,  2072,   24,         0,          0,        0.8
  23,  4081,   24,        23,          0,       1.11
  23,  4081,   24,         0,          0,       1.05
  25,  4081,   24,        23,          0,        1.5
  25,  4081,   24,         0,          0,        1.5
  25,     0,    1,        23,          0,      0.917
  25,     0,    2,         0,          0,      0.918
  26,     0,   25,        23,          0,      1.167
  26,     0,   25,         0,          0,      1.167
  26,    25,   25,        23,          0,      1.167
  26,    25,   25,         0,          0,      1.167
  24,     0,   25,        23,          0,      0.889
  24,     0,   25,         0,          0,      0.889
  24,    25,   25,        23,          0,      0.898
  24,    25,   25,         0,          0,      0.832
  26,  2048,   25,        23,          0,      1.167
  26,  2048,   25,         0,          0,      1.167
  26,  2073,   25,        23,          0,      1.167
  26,  2073,   25,         0,          0,      1.167
  24,  2048,   25,        23,          0,      0.889
  24,  2048,   25,         0,          0,      0.889
  24,  2073,   25,        23,          0,      0.879
  24,  2073,   25,         0,          0,      0.814
  24,  4081,   25,        23,          0,       1.11
  24,  4081,   25,         0,          0,      1.049
  26,  4081,   25,        23,          0,        1.5
  26,  4081,   25,         0,          0,        1.5
  26,     0,    1,        23,          0,      0.869
  26,     0,    2,         0,          0,      0.869
  27,     0,   26,        23,          0,      1.167
  27,     0,   26,         0,          0,      1.167
  27,    26,   26,        23,          0,      1.167
  27,    26,   26,         0,          0,      1.167
  25,     0,   26,        23,          0,      0.889
  25,     0,   26,         0,          0,      0.889
  25,    26,   26,        23,          0,      0.871
  25,    26,   26,         0,          0,      0.827
  27,  2048,   26,        23,          0,      1.167
  27,  2048,   26,         0,          0,      1.167
  27,  2074,   26,        23,          0,      1.167
  27,  2074,   26,         0,          0,      1.167
  25,  2048,   26,        23,          0,      0.889
  25,  2048,   26,         0,          0,      0.889
  25,  2074,   26,        23,          0,       0.88
  25,  2074,   26,         0,          0,      0.823
  25,  4081,   26,        23,          0,       1.11
  25,  4081,   26,         0,          0,      1.047
  27,  4081,   26,        23,          0,        1.5
  27,  4081,   26,         0,          0,        1.5
  27,     0,    1,        23,          0,      0.865
  27,     0,    2,         0,          0,      0.857
  28,     0,   27,        23,          0,      1.167
  28,     0,   27,         0,          0,      1.167
  28,    27,   27,        23,          0,      1.167
  28,    27,   27,         0,          0,      1.167
  26,     0,   27,        23,          0,      0.889
  26,     0,   27,         0,          0,      0.889
  26,    27,   27,        23,          0,      0.884
  26,    27,   27,         0,          0,       0.82
  28,  2048,   27,        23,          0,      1.167
  28,  2048,   27,         0,          0,      1.167
  28,  2075,   27,        23,          0,      1.167
  28,  2075,   27,         0,          0,      1.167
  26,  2048,   27,        23,          0,      0.889
  26,  2048,   27,         0,          0,      0.889
  26,  2075,   27,        23,          0,      0.892
  26,  2075,   27,         0,          0,       0.83
  26,  4081,   27,        23,          0,       1.11
  26,  4081,   27,         0,          0,      1.054
  28,  4081,   27,        23,          0,        1.5
  28,  4081,   27,         0,          0,        1.5
  28,     0,    1,        23,          0,      0.866
  28,     0,    2,         0,          0,      0.867
  29,     0,   28,        23,          0,      1.167
  29,     0,   28,         0,          0,      1.167
  29,    28,   28,        23,          0,      1.167
  29,    28,   28,         0,          0,      1.167
  27,     0,   28,        23,          0,      0.889
  27,     0,   28,         0,          0,      0.889
  27,    28,   28,        23,          0,      0.892
  27,    28,   28,         0,          0,      0.825
  29,  2048,   28,        23,          0,      1.167
  29,  2048,   28,         0,          0,      1.167
  29,  2076,   28,        23,          0,      1.167
  29,  2076,   28,         0,          0,      1.167
  27,  2048,   28,        23,          0,      0.889
  27,  2048,   28,         0,          0,      0.888
  27,  2076,   28,        23,          0,      0.898
  27,  2076,   28,         0,          0,      0.821
  27,  4081,   28,        23,          0,       1.11
  27,  4081,   28,         0,          0,      1.052
  29,  4081,   28,        23,          0,        1.5
  29,  4081,   28,         0,          0,        1.5
  29,     0,    1,        23,          0,      0.854
  29,     0,    2,         0,          0,       0.86
  30,     0,   29,        23,          0,      1.166
  30,     0,   29,         0,          0,      1.167
  30,    29,   29,        23,          0,      1.167
  30,    29,   29,         0,          0,      1.167
  28,     0,   29,        23,          0,      0.887
  28,     0,   29,         0,          0,      0.888
  28,    29,   29,        23,          0,      0.891
  28,    29,   29,         0,          0,      0.843
  30,  2048,   29,        23,          0,      1.166
  30,  2048,   29,         0,          0,      1.167
  30,  2077,   29,        23,          0,      1.167
  30,  2077,   29,         0,          0,      1.165
  28,  2048,   29,        23,          0,      0.886
  28,  2048,   29,         0,          0,      0.887
  28,  2077,   29,        23,          0,      0.891
  28,  2077,   29,         0,          0,      0.836
  28,  4081,   29,        23,          0,      1.106
  28,  4081,   29,         0,          0,      1.063
  30,  4081,   29,        23,          0,      1.496
  30,  4081,   29,         0,          0,      1.496
  30,     0,    1,        23,          0,      0.874
  30,     0,    2,         0,          0,      0.873
  31,     0,   30,        23,          0,      1.164
  31,     0,   30,         0,          0,      1.161
  31,    30,   30,        23,          0,      1.162
  31,    30,   30,         0,          0,      1.163
  29,     0,   30,        23,          0,      0.884
  29,     0,   30,         0,          0,      0.884
  29,    30,   30,        23,          0,      0.893
  29,    30,   30,         0,          0,      0.847
  31,  2048,   30,        23,          0,      1.163
  31,  2048,   30,         0,          0,      1.162
  31,  2078,   30,        23,          0,      1.161
  31,  2078,   30,         0,          0,      1.161
  29,  2048,   30,        23,          0,      0.884
  29,  2048,   30,         0,          0,      0.884
  29,  2078,   30,        23,          0,      0.894
  29,  2078,   30,         0,          0,      0.848
  29,  4081,   30,        23,          0,      1.102
  29,  4081,   30,         0,          0,      1.074
  31,  4081,   30,        23,          0,      1.159
  31,  4081,   30,         0,          0,       1.16
  31,     0,    1,        23,          0,      0.859
  31,     0,    2,         0,          0,      0.858
  32,     0,   31,        23,          0,      1.161
  32,     0,   31,         0,          0,      1.161
  32,    31,   31,        23,          0,      1.161
  32,    31,   31,         0,          0,      1.161
  30,     0,   31,        23,          0,      0.882
  30,     0,   31,         0,          0,      0.883
  30,    31,   31,        23,          0,      0.897
  30,    31,   31,         0,          0,      0.854
  32,  2048,   31,        23,          0,      1.161
  32,  2048,   31,         0,          0,      1.161
  32,  2079,   31,        23,          0,      1.159
  32,  2079,   31,         0,          0,      1.158
  30,  2048,   31,        23,          0,      0.881
  30,  2048,   31,         0,          0,      0.882
  30,  2079,   31,        23,          0,      0.891
  30,  2079,   31,         0,          0,      0.851
  30,  4081,   31,        23,          0,        1.1
  30,  4081,   31,         0,          0,      1.066
  32,  4081,   31,        23,          0,      1.157
  32,  4081,   31,         0,          0,      1.157
  32,     0,    1,        23,          0,      0.798
  32,     0,    2,         0,          0,      0.798
2048,     0,   32,        23,          1,      0.993
 256,     1,   64,        23,          1,       0.89
2048,     0,   32,         0,          1,      0.992
 256,     1,   64,         0,          1,      0.894
 256,  4081,   64,         0,          1,      0.903
 256,     0,    1,        23,          1,      1.158
 256,     0,    1,         0,          1,      1.157
 256,     1,    1,        23,          1,      1.158
 256,     1,    1,         0,          1,      1.158
2048,     0,   64,        23,          1,       0.79
 256,     2,   64,        23,          1,      0.894
2048,     0,   64,         0,          1,       0.79
 256,     2,   64,         0,          1,      0.894
 256,     0,    2,        23,          1,      1.161
 256,     0,    2,         0,          1,      1.161
 256,     2,    2,        23,          1,      1.161
 256,     2,    2,         0,          1,      1.161
2048,     0,  128,        23,          1,      1.319
 256,     3,   64,        23,          1,      0.897
2048,     0,  128,         0,          1,      1.323
 256,     3,   64,         0,          1,        0.9
 256,     0,    3,        23,          1,      1.166
 256,     0,    3,         0,          1,      1.167
 256,     3,    3,        23,          1,      1.167
 256,     3,    3,         0,          1,      1.167
2048,     0,  256,        23,          1,      0.995
 256,     4,   64,        23,          1,      0.902
2048,     0,  256,         0,          1,      0.993
 256,     4,   64,         0,          1,      0.901
 256,     0,    4,        23,          1,      1.167
 256,     0,    4,         0,          1,      1.167
 256,     4,    4,        23,          1,      1.167
 256,     4,    4,         0,          1,      1.167
2048,     0,  512,        23,          1,      1.109
 256,     5,   64,        23,          1,      0.903
2048,     0,  512,         0,          1,      1.109
 256,     5,   64,         0,          1,      0.897
 256,     0,    5,        23,          1,      1.167
 256,     0,    5,         0,          1,      1.167
 256,     5,    5,        23,          1,      1.167
 256,     5,    5,         0,          1,      1.167
2048,     0, 1024,        23,          1,      0.951
 256,     6,   64,        23,          1,      0.902
2048,     0, 1024,         0,          1,      0.953
 256,     6,   64,         0,          1,        0.9
 256,     0,    6,        23,          1,      1.167
 256,     0,    6,         0,          1,      1.167
 256,     6,    6,        23,          1,      1.167
 256,     6,    6,         0,          1,      1.167
2048,     0, 2048,        23,          1,      0.896
 256,     7,   64,        23,          1,      0.901
2048,     0, 2048,         0,          1,      0.845
 256,     7,   64,         0,          1,        0.9
 256,     0,    7,        23,          1,      1.165
 256,     0,    7,         0,          1,      1.165
 256,     7,    7,        23,          1,      1.165
 256,     7,    7,         0,          1,      1.165
 192,     1,   32,        23,          1,      0.892
 192,     1,   32,         0,          1,      0.892
 256,     1,   32,        23,          1,      0.892
 256,     1,   32,         0,          1,      0.892
 512,     1,   32,        23,          1,      0.892
 512,     1,   32,         0,          1,      0.892
 256,  4081,   32,        23,          1,      0.902
 192,     2,   64,        23,          1,      0.902
 192,     2,   64,         0,          1,      0.898
 512,     2,   64,        23,          1,        0.9
 512,     2,   64,         0,          1,      0.899
 256,  4081,   64,        23,          1,      0.908
 192,     3,   96,        23,          1,      1.174
 192,     3,   96,         0,          1,      1.174
 256,     3,   96,        23,          1,      1.174
 256,     3,   96,         0,          1,      1.174
 512,     3,   96,        23,          1,      1.174
 512,     3,   96,         0,          1,      1.174
 256,  4081,   96,        23,          1,      1.255
 192,     4,  128,        23,          1,       1.08
 192,     4,  128,         0,          1,      1.081
 256,     4,  128,        23,          1,      1.081
 256,     4,  128,         0,          1,      1.081
 512,     4,  128,        23,          1,       1.08
 512,     4,  128,         0,          1,      1.081
 256,  4081,  128,        23,          1,       1.25
 192,     5,  160,        23,          1,      0.862
 192,     5,  160,         0,          1,      0.864
 256,     5,  160,        23,          1,      0.729
 256,     5,  160,         0,          1,       0.73
 512,     5,  160,        23,          1,       0.73
 512,     5,  160,         0,          1,      0.729
 256,  4081,  160,        23,          1,      0.834
 192,     6,  192,        23,          1,      0.868
 192,     6,  192,         0,          1,      0.868
 256,     6,  192,        23,          1,      0.903
 256,     6,  192,         0,          1,      0.903
 512,     6,  192,        23,          1,      0.902
 512,     6,  192,         0,          1,      0.902
 256,  4081,  192,        23,          1,      0.902
 192,     7,  224,        23,          1,      0.866
 192,     7,  224,         0,          1,      0.865
 256,     7,  224,        23,          1,      0.885
 256,     7,  224,         0,          1,      0.885
 512,     7,  224,        23,          1,      0.948
 512,     7,  224,         0,          1,       0.95
 256,  4081,  224,        23,          1,      0.921
   2,     0,    1,        23,          1,       1.02
   2,     0,    1,         0,          1,      1.026
   2,     1,    1,        23,          1,      0.873
   2,     1,    1,         0,          1,      0.873
   0,     0,    1,        23,          1,      0.581
   0,     0,    1,         0,          1,      0.581
   0,     1,    1,        23,          1,      0.537
   0,     1,    1,         0,          1,      0.537
   2,  2048,    1,        23,          1,      0.749
   2,  2048,    1,         0,          1,      0.747
   2,  2049,    1,        23,          1,      0.636
   2,  2049,    1,         0,          1,      0.636
   0,  2048,    1,        23,          1,      0.498
   0,  2048,    1,         0,          1,      0.498
   0,  2049,    1,        23,          1,       0.46
   0,  2049,    1,         0,          1,       0.46
   0,  4081,    1,        23,          1,       0.46
   0,  4081,    1,         0,          1,       0.46
   2,  4081,    1,        23,          1,      0.611
   2,  4081,    1,         0,          1,      0.608
   2,     0,    2,         0,          1,      0.894
   3,     0,    2,        23,          1,       1.06
   3,     0,    2,         0,          1,      1.027
   3,     2,    2,        23,          1,      0.893
   3,     2,    2,         0,          1,      0.892
   1,     0,    2,        23,          1,      0.916
   1,     0,    2,         0,          1,      0.916
   1,     2,    2,        23,          1,      1.004
   1,     2,    2,         0,          1,      1.094
   3,  2048,    2,        23,          1,      0.749
   3,  2048,    2,         0,          1,      0.749
   3,  2050,    2,        23,          1,      0.636
   3,  2050,    2,         0,          1,      0.636
   1,  2048,    2,        23,          1,      0.665
   1,  2048,    2,         0,          1,      0.664
   1,  2050,    2,        23,          1,      0.723
   1,  2050,    2,         0,          1,      0.724
   1,  4081,    2,        23,          1,       0.72
   1,  4081,    2,         0,          1,       0.74
   3,  4081,    2,        23,          1,      0.618
   3,  4081,    2,         0,          1,      0.597
   3,     0,    1,        23,          1,      1.032
   4,     0,    3,        23,          1,      1.036
   4,     0,    3,         0,          1,       1.05
   4,     3,    3,        23,          1,      0.889
   4,     3,    3,         0,          1,      0.881
   2,     0,    3,        23,          1,      0.907
   2,     0,    3,         0,          1,      0.902
   2,     3,    3,        23,          1,      0.982
   2,     3,    3,         0,          1,      1.059
   4,  2048,    3,        23,          1,      0.748
   4,  2048,    3,         0,          1,      0.748
   4,  2051,    3,        23,          1,      0.637
   4,  2051,    3,         0,          1,      0.637
   2,  2048,    3,        23,          1,      0.664
   2,  2048,    3,         0,          1,      0.665
   2,  2051,    3,        23,          1,      0.726
   2,  2051,    3,         0,          1,      0.726
   2,  4081,    3,        23,          1,      0.727
   2,  4081,    3,         0,          1,      0.727
   4,  4081,    3,        23,          1,      0.647
   4,  4081,    3,         0,          1,      0.622
   4,     0,    1,        23,          1,      1.043
   4,     0,    2,         0,          1,      1.039
   5,     0,    4,        23,          1,      1.043
   5,     0,    4,         0,          1,      1.055
   5,     4,    4,        23,          1,      0.889
   5,     4,    4,         0,          1,      0.878
   3,     0,    4,        23,          1,      0.889
   3,     0,    4,         0,          1,      0.894
   3,     4,    4,        23,          1,      0.958
   3,     4,    4,         0,          1,      1.033
   5,  2048,    4,        23,          1,       0.75
   5,  2048,    4,         0,          1,       0.75
   5,  2052,    4,        23,          1,      0.638
   5,  2052,    4,         0,          1,      0.637
   3,  2048,    4,        23,          1,      0.666
   3,  2048,    4,         0,          1,      0.666
   3,  2052,    4,        23,          1,      0.706
   3,  2052,    4,         0,          1,      0.713
   3,  4081,    4,        23,          1,      0.712
   3,  4081,    4,         0,          1,      0.711
   5,  4081,    4,        23,          1,       0.66
   5,  4081,    4,         0,          1,      0.629
   5,     0,    1,        23,          1,      1.026
   5,     0,    2,         0,          1,      1.057
   6,     0,    5,        23,          1,      1.031
   6,     0,    5,         0,          1,      1.029
   6,     5,    5,        23,          1,      0.894
   6,     5,    5,         0,          1,      0.887
   4,     0,    5,        23,          1,      0.889
   4,     0,    5,         0,          1,      0.889
   4,     5,    5,        23,          1,      0.932
   4,     5,    5,         0,          1,      1.026
   6,  2048,    5,        23,          1,      0.749
   6,  2048,    5,         0,          1,       0.75
   6,  2053,    5,        23,          1,      0.638
   6,  2053,    5,         0,          1,      0.638
   4,  2048,    5,        23,          1,      0.667
   4,  2048,    5,         0,          1,      0.667
   4,  2053,    5,        23,          1,      0.692
   4,  2053,    5,         0,          1,      0.699
   4,  4081,    5,        23,          1,      0.698
   4,  4081,    5,         0,          1,      0.698
   6,  4081,    5,        23,          1,      0.639
   6,  4081,    5,         0,          1,      0.619
   6,     0,    1,        23,          1,      1.027
   6,     0,    2,         0,          1,      1.026
   7,     0,    6,        23,          1,      1.028
   7,     0,    6,         0,          1,      1.028
   7,     6,    6,        23,          1,      0.874
   7,     6,    6,         0,          1,      0.882
   5,     0,    6,        23,          1,      0.888
   5,     0,    6,         0,          1,      0.888
   5,     6,    6,        23,          1,      0.942
   5,     6,    6,         0,          1,      1.014
   7,  2048,    6,        23,          1,       0.75
   7,  2048,    6,         0,          1,      0.749
   7,  2054,    6,        23,          1,      0.637
   7,  2054,    6,         0,          1,      0.638
   5,  2048,    6,        23,          1,      0.667
   5,  2048,    6,         0,          1,      0.666
   5,  2054,    6,        23,          1,      0.706
   5,  2054,    6,         0,          1,      0.702
   5,  4081,    6,        23,          1,      0.705
   5,  4081,    6,         0,          1,      0.705
   7,  4081,    6,        23,          1,      0.659
   7,  4081,    6,         0,          1,      0.638
   7,     0,    1,        23,          1,      1.042
   7,     0,    2,         0,          1,      1.035
   8,     0,    7,        23,          1,      1.033
   8,     0,    7,         0,          1,      1.027
   8,     7,    7,        23,          1,      0.886
   8,     7,    7,         0,          1,      0.875
   6,     0,    7,        23,          1,      0.889
   6,     0,    7,         0,          1,      0.889
   6,     7,    7,        23,          1,      0.912
   6,     7,    7,         0,          1,      0.982
   8,  2048,    7,        23,          1,      0.755
   8,  2048,    7,         0,          1,      0.749
   8,  2055,    7,        23,          1,      0.638
   8,  2055,    7,         0,          1,      0.638
   6,  2048,    7,        23,          1,      0.667
   6,  2048,    7,         0,          1,      0.667
   6,  2055,    7,        23,          1,      0.692
   6,  2055,    7,         0,          1,      0.693
   6,  4081,    7,        23,          1,      0.689
   6,  4081,    7,         0,          1,      0.723
   8,  4081,    7,        23,          1,       0.64
   8,  4081,    7,         0,          1,      0.631
   8,     0,    1,        23,          1,      1.028
   8,     0,    2,         0,          1,      1.039
   9,     0,    8,        23,          1,      1.029
   9,     0,    8,         0,          1,      1.028
   9,     8,    8,        23,          1,       0.55
   9,     8,    8,         0,          1,      0.542
   7,     0,    8,        23,          1,      0.889
   7,     0,    8,         0,          1,      0.889
   7,     8,    8,        23,          1,      0.934
   7,     8,    8,         0,          1,      1.011
   9,  2048,    8,        23,          1,      0.751
   9,  2048,    8,         0,          1,       0.75
   9,  2056,    8,        23,          1,      0.553
   9,  2056,    8,         0,          1,      0.542
   7,  2048,    8,        23,          1,      0.667
   7,  2048,    8,         0,          1,      0.667
   7,  2056,    8,        23,          1,      0.712
   7,  2056,    8,         0,          1,       0.73
   7,  4081,    8,        23,          1,      0.716
   7,  4081,    8,         0,          1,       0.76
   9,  4081,    8,        23,          1,      0.632
   9,  4081,    8,         0,          1,      0.624
   9,     0,    1,        23,          1,      1.028
   9,     0,    2,         0,          1,      1.028
  10,     0,    9,        23,          1,      1.027
  10,     0,    9,         0,          1,      1.028
  10,     9,    9,        23,          1,      0.545
  10,     9,    9,         0,          1,      0.536
   8,     0,    9,        23,          1,      0.889
   8,     0,    9,         0,          1,      0.889
   8,     9,    9,        23,          1,      0.627
   8,     9,    9,         0,          1,      0.637
  10,  2048,    9,        23,          1,      0.751
  10,  2048,    9,         0,          1,       0.75
  10,  2057,    9,        23,          1,      0.545
  10,  2057,    9,         0,          1,      0.547
   8,  2048,    9,        23,          1,      0.667
   8,  2048,    9,         0,          1,      0.667
   8,  2057,    9,        23,          1,      0.627
   8,  2057,    9,         0,          1,      0.633
   8,  4081,    9,        23,          1,      0.726
   8,  4081,    9,         0,          1,      0.775
  10,  4081,    9,        23,          1,      0.657
  10,  4081,    9,         0,          1,      0.642
  10,     0,    1,        23,          1,       1.03
  10,     0,    2,         0,          1,      1.033
  11,     0,   10,        23,          1,      1.029
  11,     0,   10,         0,          1,       1.03
  11,    10,   10,        23,          1,      0.542
  11,    10,   10,         0,          1,      0.549
   9,     0,   10,        23,          1,      0.889
   9,     0,   10,         0,          1,      0.889
   9,    10,   10,        23,          1,      0.627
   9,    10,   10,         0,          1,      0.646
  11,  2048,   10,        23,          1,      0.751
  11,  2048,   10,         0,          1,       0.75
  11,  2058,   10,        23,          1,      0.553
  11,  2058,   10,         0,          1,      0.538
   9,  2048,   10,        23,          1,      0.667
   9,  2048,   10,         0,          1,      0.667
   9,  2058,   10,        23,          1,      0.627
   9,  2058,   10,         0,          1,      0.656
   9,  4081,   10,        23,          1,      0.726
   9,  4081,   10,         0,          1,      0.773
  11,  4081,   10,        23,          1,      0.625
  11,  4081,   10,         0,          1,      0.613
  11,     0,    1,        23,          1,      1.029
  11,     0,    2,         0,          1,      1.029
  12,     0,   11,        23,          1,      1.028
  12,     0,   11,         0,          1,      1.028
  12,    11,   11,        23,          1,      0.545
  12,    11,   11,         0,          1,      0.537
  10,     0,   11,        23,          1,      0.889
  10,     0,   11,         0,          1,      0.889
  10,    11,   11,        23,          1,      0.627
  10,    11,   11,         0,          1,      0.655
  12,  2048,   11,        23,          1,      0.757
  12,  2048,   11,         0,          1,       0.75
  12,  2059,   11,        23,          1,      0.536
  12,  2059,   11,         0,          1,      0.545
  10,  2048,   11,        23,          1,      0.672
  10,  2048,   11,         0,          1,      0.667
  10,  2059,   11,        23,          1,      0.627
  10,  2059,   11,         0,          1,       0.66
  10,  4081,   11,        23,          1,      0.726
  10,  4081,   11,         0,          1,      0.793
  12,  4081,   11,        23,          1,      0.627
  12,  4081,   11,         0,          1,      0.633
  12,     0,    1,        23,          1,      1.028
  12,     0,    2,         0,          1,      1.029
  13,     0,   12,        23,          1,      1.028
  13,     0,   12,         0,          1,      1.028
  13,    12,   12,        23,          1,      0.547
  13,    12,   12,         0,          1,      0.542
  11,     0,   12,        23,          1,      0.889
  11,     0,   12,         0,          1,      0.889
  11,    12,   12,        23,          1,      0.627
  11,    12,   12,         0,          1,       0.69
  13,  2048,   12,        23,          1,       0.75
  13,  2048,   12,         0,          1,       0.75
  13,  2060,   12,        23,          1,       0.55
  13,  2060,   12,         0,          1,      0.542
  11,  2048,   12,        23,          1,      0.667
  11,  2048,   12,         0,          1,      0.667
  11,  2060,   12,        23,          1,      0.627
  11,  2060,   12,         0,          1,      0.646
  11,  4081,   12,        23,          1,      0.726
  11,  4081,   12,         0,          1,       0.78
  13,  4081,   12,        23,          1,      0.632
  13,  4081,   12,         0,          1,      0.619
  13,     0,    1,        23,          1,      1.028
  13,     0,    2,         0,          1,      1.028
  14,     0,   13,        23,          1,      1.032
  14,     0,   13,         0,          1,      1.038
  14,    13,   13,        23,          1,       0.55
  14,    13,   13,         0,          1,      0.539
  12,     0,   13,        23,          1,      0.889
  12,     0,   13,         0,          1,      0.889
  12,    13,   13,        23,          1,      0.627
  12,    13,   13,         0,          1,      0.655
  14,  2048,   13,        23,          1,      0.751
  14,  2048,   13,         0,          1,      0.751
  14,  2061,   13,        23,          1,      0.542
  14,  2061,   13,         0,          1,      0.547
  12,  2048,   13,        23,          1,      0.667
  12,  2048,   13,         0,          1,      0.667
  12,  2061,   13,        23,          1,      0.627
  12,  2061,   13,         0,          1,      0.646
  12,  4081,   13,        23,          1,      0.726
  12,  4081,   13,         0,          1,      0.769
  14,  4081,   13,        23,          1,      0.627
  14,  4081,   13,         0,          1,       0.62
  14,     0,    1,        23,          1,      1.035
  14,     0,    2,         0,          1,      1.033
  15,     0,   14,        23,          1,      1.028
  15,     0,   14,         0,          1,      1.028
  15,    14,   14,        23,          1,      0.545
  15,    14,   14,         0,          1,      0.531
  13,     0,   14,        23,          1,      0.889
  13,     0,   14,         0,          1,      0.889
  13,    14,   14,        23,          1,      0.628
  13,    14,   14,         0,          1,      0.628
  15,  2048,   14,        23,          1,      0.751
  15,  2048,   14,         0,          1,       0.75
  15,  2062,   14,        23,          1,      0.542
  15,  2062,   14,         0,          1,      0.536
  13,  2048,   14,        23,          1,      0.667
  13,  2048,   14,         0,          1,      0.667
  13,  2062,   14,        23,          1,      0.627
  13,  2062,   14,         0,          1,      0.628
  13,  4081,   14,        23,          1,      0.726
  13,  4081,   14,         0,          1,      0.747
  15,  4081,   14,        23,          1,      0.874
  15,  4081,   14,         0,          1,      0.879
  15,     0,    1,        23,          1,      1.028
  15,     0,    2,         0,          1,      1.028
  16,     0,   15,        23,          1,      0.728
  16,     0,   15,         0,          1,      0.735
  16,    15,   15,        23,          1,      0.647
  16,    15,   15,         0,          1,      0.647
  14,     0,   15,        23,          1,      0.889
  14,     0,   15,         0,          1,      0.889
  14,    15,   15,        23,          1,      0.627
  14,    15,   15,         0,          1,      0.647
  16,  2048,   15,        23,          1,      0.732
  16,  2048,   15,         0,          1,      0.714
  16,  2063,   15,        23,          1,       0.65
  16,  2063,   15,         0,          1,       0.65
  14,  2048,   15,        23,          1,      0.667
  14,  2048,   15,         0,          1,      0.667
  14,  2063,   15,        23,          1,      0.627
  14,  2063,   15,         0,          1,      0.674
  14,  4081,   15,        23,          1,      0.724
  14,  4081,   15,         0,          1,      0.777
  16,  4081,   15,        23,          1,       1.01
  16,  4081,   15,         0,          1,      0.997
  16,     0,    1,        23,          1,      0.722
  16,     0,    2,         0,          1,      0.725
  17,     0,   16,        23,          1,      1.167
  17,     0,   16,         0,          1,      1.167
  17,    16,   16,        23,          1,      1.167
  17,    16,   16,         0,          1,      1.167
  15,     0,   16,        23,          1,      0.891
  15,     0,   16,         0,          1,      0.892
  15,    16,   16,        23,          1,      0.668
  15,    16,   16,         0,          1,      0.699
  17,  2048,   16,        23,          1,      1.167
  17,  2048,   16,         0,          1,      1.167
  17,  2064,   16,        23,          1,      1.167
  17,  2064,   16,         0,          1,      1.167
  15,  2048,   16,        23,          1,      0.668
  15,  2048,   16,         0,          1,      0.667
  15,  2064,   16,        23,          1,      0.667
  15,  2064,   16,         0,          1,      0.771
  15,  4081,   16,        23,          1,      0.933
  15,  4081,   16,         0,          1,      1.056
  17,  4081,   16,        23,          1,       1.78
  17,  4081,   16,         0,          1,      1.789
  17,     0,    1,        23,          1,       1.17
  17,     0,    2,         0,          1,      1.169
  18,     0,   17,        23,          1,      0.859
  18,     0,   17,         0,          1,      0.857
  18,    17,   17,        23,          1,      0.857
  18,    17,   17,         0,          1,      0.857
  16,     0,   17,        23,          1,      0.673
  16,     0,   17,         0,          1,      0.672
  16,    17,   17,        23,          1,      0.628
  16,    17,   17,         0,          1,      0.628
  18,  2048,   17,        23,          1,      0.861
  18,  2048,   17,         0,          1,      0.859
  18,  2065,   17,        23,          1,       0.86
  18,  2065,   17,         0,          1,      0.857
  16,  2048,   17,        23,          1,      0.668
  16,  2048,   17,         0,          1,      0.668
  16,  2065,   17,        23,          1,      0.627
  16,  2065,   17,         0,          1,      0.627
  16,  4081,   17,        23,          1,      1.049
  16,  4081,   17,         0,          1,      1.174
  18,  4081,   17,        23,          1,      1.068
  18,  4081,   17,         0,          1,      1.064
  18,     0,    1,        23,          1,      1.172
  18,     0,    2,         0,          1,      1.172
  19,     0,   18,        23,          1,      0.865
  19,     0,   18,         0,          1,      0.864
  19,    18,   18,        23,          1,       0.86
  19,    18,   18,         0,          1,      0.861
  17,     0,   18,        23,          1,      0.895
  17,     0,   18,         0,          1,      0.895
  17,    18,   18,        23,          1,      0.896
  17,    18,   18,         0,          1,      0.836
  19,  2048,   18,        23,          1,      0.866
  19,  2048,   18,         0,          1,      0.866
  19,  2066,   18,        23,          1,      0.866
  19,  2066,   18,         0,          1,      0.863
  17,  2048,   18,        23,          1,      0.896
  17,  2048,   18,         0,          1,      0.895
  17,  2066,   18,        23,          1,      0.895
  17,  2066,   18,         0,          1,      0.877
  17,  4081,   18,        23,          1,      1.115
  17,  4081,   18,         0,          1,       1.07
  19,  4081,   18,        23,          1,      1.061
  19,  4081,   18,         0,          1,       1.06
  19,     0,    1,        23,          1,      1.168
  19,     0,    2,         0,          1,      1.168
  20,     0,   19,        23,          1,      0.855
  20,     0,   19,         0,          1,      0.858
  20,    19,   19,        23,          1,      0.856
  20,    19,   19,         0,          1,      0.855
  18,     0,   19,        23,          1,       0.89
  18,     0,   19,         0,          1,       0.89
  18,    19,   19,        23,          1,       0.89
  18,    19,   19,         0,          1,      0.875
  20,  2048,   19,        23,          1,      0.859
  20,  2048,   19,         0,          1,      0.855
  20,  2067,   19,        23,          1,      0.854
  20,  2067,   19,         0,          1,      0.856
  18,  2048,   19,        23,          1,      0.889
  18,  2048,   19,         0,          1,      0.889
  18,  2067,   19,        23,          1,      0.889
  18,  2067,   19,         0,          1,      0.893
  18,  4081,   19,        23,          1,      1.109
  18,  4081,   19,         0,          1,      1.067
  20,  4081,   19,        23,          1,      1.053
  20,  4081,   19,         0,          1,      1.052
  20,     0,    1,        23,          1,      1.165
  20,     0,    2,         0,          1,      1.166
  21,     0,   20,        23,          1,      0.855
  21,     0,   20,         0,          1,      0.856
  21,    20,   20,        23,          1,      0.854
  21,    20,   20,         0,          1,      0.854
  19,     0,   20,        23,          1,      0.888
  19,     0,   20,         0,          1,      0.888
  19,    20,   20,        23,          1,      0.888
  19,    20,   20,         0,          1,      0.868
  21,  2048,   20,        23,          1,      0.853
  21,  2048,   20,         0,          1,      0.857
  21,  2068,   20,        23,          1,      0.855
  21,  2068,   20,         0,          1,      0.854
  19,  2048,   20,        23,          1,      0.889
  19,  2048,   20,         0,          1,      0.889
  19,  2068,   20,        23,          1,      0.889
  19,  2068,   20,         0,          1,      0.894
  19,  4081,   20,        23,          1,      1.112
  19,  4081,   20,         0,          1,      1.103
  21,  4081,   20,        23,          1,      1.054
  21,  4081,   20,         0,          1,      1.051
  21,     0,    1,        23,          1,      1.169
  21,     0,    2,         0,          1,      1.168
  22,     0,   21,        23,          1,      0.853
  22,     0,   21,         0,          1,      0.855
  22,    21,   21,        23,          1,      0.856
  22,    21,   21,         0,          1,      0.853
  20,     0,   21,        23,          1,      0.889
  20,     0,   21,         0,          1,      0.889
  20,    21,   21,        23,          1,      0.889
  20,    21,   21,         0,          1,       0.91
  22,  2048,   21,        23,          1,      0.852
  22,  2048,   21,         0,          1,      0.855
  22,  2069,   21,        23,          1,      0.853
  22,  2069,   21,         0,          1,      0.854
  20,  2048,   21,        23,          1,      0.889
  20,  2048,   21,         0,          1,      0.889
  20,  2069,   21,        23,          1,      0.889
  20,  2069,   21,         0,          1,      0.925
  20,  4081,   21,        23,          1,      1.111
  20,  4081,   21,         0,          1,      1.111
  22,  4081,   21,        23,          1,      1.053
  22,  4081,   21,         0,          1,      1.051
  22,     0,    1,        23,          1,      1.167
  22,     0,    2,         0,          1,      1.165
  23,     0,   22,        23,          1,      0.853
  23,     0,   22,         0,          1,      0.853
  23,    22,   22,        23,          1,      0.853
  23,    22,   22,         0,          1,      0.853
  21,     0,   22,        23,          1,      0.888
  21,     0,   22,         0,          1,      0.888
  21,    22,   22,        23,          1,      0.889
  21,    22,   22,         0,          1,      0.931
  23,  2048,   22,        23,          1,      0.854
  23,  2048,   22,         0,          1,      0.854
  23,  2070,   22,        23,          1,      0.853
  23,  2070,   22,         0,          1,      0.852
  21,  2048,   22,        23,          1,      0.887
  21,  2048,   22,         0,          1,      0.887
  21,  2070,   22,        23,          1,      0.887
  21,  2070,   22,         0,          1,      0.901
  21,  4081,   22,        23,          1,      1.107
  21,  4081,   22,         0,          1,       1.11
  23,  4081,   22,        23,          1,      1.047
  23,  4081,   22,         0,          1,      1.049
  23,     0,    1,        23,          1,      1.163
  23,     0,    2,         0,          1,      1.163
  24,     0,   23,        23,          1,      0.851
  24,     0,   23,         0,          1,      0.852
  24,    23,   23,        23,          1,      0.852
  24,    23,   23,         0,          1,      0.854
  22,     0,   23,        23,          1,      0.888
  22,     0,   23,         0,          1,      0.888
  22,    23,   23,        23,          1,      0.888
  22,    23,   23,         0,          1,      0.908
  24,  2048,   23,        23,          1,      0.853
  24,  2048,   23,         0,          1,      0.851
  24,  2071,   23,        23,          1,      0.851
  24,  2071,   23,         0,          1,      0.851
  22,  2048,   23,        23,          1,      0.888
  22,  2048,   23,         0,          1,      0.888
  22,  2071,   23,        23,          1,      0.888
  22,  2071,   23,         0,          1,      0.882
  22,  4081,   23,        23,          1,      1.109
  22,  4081,   23,         0,          1,      1.084
  24,  4081,   23,        23,          1,      1.049
  24,  4081,   23,         0,          1,      1.049
  24,     0,    1,        23,          1,      1.164
  24,     0,    2,         0,          1,      1.164
  25,     0,   24,        23,          1,      0.855
  25,     0,   24,         0,          1,      0.849
  25,    24,   24,        23,          1,      0.859
  25,    24,   24,         0,          1,      0.861
  23,     0,   24,        23,          1,      0.885
  23,     0,   24,         0,          1,      0.885
  23,    24,   24,        23,          1,      0.887
  23,    24,   24,         0,          1,      0.898
  25,  2048,   24,        23,          1,      0.851
  25,  2048,   24,         0,          1,      0.852
  25,  2072,   24,        23,          1,      0.852
  25,  2072,   24,         0,          1,      0.852
  23,  2048,   24,        23,          1,      0.886
  23,  2048,   24,         0,          1,      0.886
  23,  2072,   24,        23,          1,      0.886
  23,  2072,   24,         0,          1,      0.916
  23,  4081,   24,        23,          1,      1.106
  23,  4081,   24,         0,          1,      1.078
  25,  4081,   24,        23,          1,      1.044
  25,  4081,   24,         0,          1,      1.045
  25,     0,    1,        23,          1,      1.163
  25,     0,    2,         0,          1,      1.163
  26,     0,   25,        23,          1,      0.849
  26,     0,   25,         0,          1,      0.851
  26,    25,   25,        23,          1,      0.844
  26,    25,   25,         0,          1,      0.849
  24,     0,   25,        23,          1,      0.885
  24,     0,   25,         0,          1,      0.886
  24,    25,   25,        23,          1,      0.875
  24,    25,   25,         0,          1,      0.845
  26,  2048,   25,        23,          1,       0.85
  26,  2048,   25,         0,          1,      0.849
  26,  2073,   25,        23,          1,      0.862
  26,  2073,   25,         0,          1,      0.861
  24,  2048,   25,        23,          1,      0.886
  24,  2048,   25,         0,          1,      0.885
  24,  2073,   25,        23,          1,      0.862
  24,  2073,   25,         0,          1,      0.836
  24,  4081,   25,        23,          1,      1.105
  24,  4081,   25,         0,          1,      1.088
  26,  4081,   25,        23,          1,      1.047
  26,  4081,   25,         0,          1,      1.045
  26,     0,    1,        23,          1,      1.163
  26,     0,    2,         0,          1,      1.163
  27,     0,   26,        23,          1,      0.853
  27,     0,   26,         0,          1,      0.853
  27,    26,   26,        23,          1,       0.85
  27,    26,   26,         0,          1,       0.86
  25,     0,   26,        23,          1,      0.888
  25,     0,   26,         0,          1,      0.887
  25,    26,   26,        23,          1,      0.867
  25,    26,   26,         0,          1,      0.844
  27,  2048,   26,        23,          1,      0.852
  27,  2048,   26,         0,          1,      0.851
  27,  2074,   26,        23,          1,      0.872
  27,  2074,   26,         0,          1,      0.878
  25,  2048,   26,        23,          1,      0.889
  25,  2048,   26,         0,          1,      0.888
  25,  2074,   26,        23,          1,      0.868
  25,  2074,   26,         0,          1,      0.854
  25,  4081,   26,        23,          1,      1.109
  25,  4081,   26,         0,          1,      1.102
  27,  4081,   26,        23,          1,      1.046
  27,  4081,   26,         0,          1,      1.049
  27,     0,    1,        23,          1,      1.165
  27,     0,    2,         0,          1,      1.165
  28,     0,   27,        23,          1,      0.853
  28,     0,   27,         0,          1,      0.854
  28,    27,   27,        23,          1,      0.873
  28,    27,   27,         0,          1,      0.878
  26,     0,   27,        23,          1,      0.887
  26,     0,   27,         0,          1,      0.888
  26,    27,   27,        23,          1,      0.875
  26,    27,   27,         0,          1,      0.851
  28,  2048,   27,        23,          1,      0.851
  28,  2048,   27,         0,          1,      0.851
  28,  2075,   27,        23,          1,      0.879
  28,  2075,   27,         0,          1,      0.883
  26,  2048,   27,        23,          1,      0.888
  26,  2048,   27,         0,          1,      0.888
  26,  2075,   27,        23,          1,      0.876
  26,  2075,   27,         0,          1,       0.86
  26,  4081,   27,        23,          1,      1.109
  26,  4081,   27,         0,          1,      1.105
  28,  4081,   27,        23,          1,      1.048
  28,  4081,   27,         0,          1,      1.048
  28,     0,    1,        23,          1,      1.164
  28,     0,    2,         0,          1,      1.165
  29,     0,   28,        23,          1,      0.854
  29,     0,   28,         0,          1,      0.852
  29,    28,   28,        23,          1,      0.887
  29,    28,   28,         0,          1,      0.884
  27,     0,   28,        23,          1,      0.887
  27,     0,   28,         0,          1,      0.889
  27,    28,   28,        23,          1,      0.885
  27,    28,   28,         0,          1,      0.866
  29,  2048,   28,        23,          1,      0.853
  29,  2048,   28,         0,          1,      0.852
  29,  2076,   28,        23,          1,      0.879
  29,  2076,   28,         0,          1,      0.876
  27,  2048,   28,        23,          1,      0.889
  27,  2048,   28,         0,          1,      0.891
  27,  2076,   28,        23,          1,      0.883
  27,  2076,   28,         0,          1,       0.86
  27,  4081,   28,        23,          1,       1.11
  27,  4081,   28,         0,          1,      1.106
  29,  4081,   28,        23,          1,      1.051
  29,  4081,   28,         0,          1,      1.052
  29,     0,    1,        23,          1,      1.168
  29,     0,    2,         0,          1,      1.168
  30,     0,   29,        23,          1,      0.856
  30,     0,   29,         0,          1,      0.854
  30,    29,   29,        23,          1,      0.873
  30,    29,   29,         0,          1,      0.874
  28,     0,   29,        23,          1,      0.891
  28,     0,   29,         0,          1,      0.891
  28,    29,   29,        23,          1,      0.884
  28,    29,   29,         0,          1,      0.872
  30,  2048,   29,        23,          1,      0.859
  30,  2048,   29,         0,          1,      0.856
  30,  2077,   29,        23,          1,      0.879
  30,  2077,   29,         0,          1,      0.878
  28,  2048,   29,        23,          1,      0.891
  28,  2048,   29,         0,          1,      0.891
  28,  2077,   29,        23,          1,      0.889
  28,  2077,   29,         0,          1,      0.863
  28,  4081,   29,        23,          1,      1.109
  28,  4081,   29,         0,          1,      1.122
  30,  4081,   29,        23,          1,      1.054
  30,  4081,   29,         0,          1,      1.052
  30,     0,    1,        23,          1,      1.163
  30,     0,    2,         0,          1,      1.161
  31,     0,   30,        23,          1,      0.851
  31,     0,   30,         0,          1,      0.849
  31,    30,   30,        23,          1,      0.871
  31,    30,   30,         0,          1,      0.874
  29,     0,   30,        23,          1,      0.884
  29,     0,   30,         0,          1,      0.885
  29,    30,   30,        23,          1,      0.888
  29,    30,   30,         0,          1,      0.864
  31,  2048,   30,        23,          1,      0.854
  31,  2048,   30,         0,          1,      0.852
  31,  2078,   30,        23,          1,      0.874
  31,  2078,   30,         0,          1,      0.882
  29,  2048,   30,        23,          1,      0.888
  29,  2048,   30,         0,          1,      0.889
  29,  2078,   30,        23,          1,      0.895
  29,  2078,   30,         0,          1,      0.878
  29,  4081,   30,        23,          1,      1.109
  29,  4081,   30,         0,          1,      1.128
  31,  4081,   30,        23,          1,      0.804
  31,  4081,   30,         0,          1,      0.803
  31,     0,    1,        23,          1,      1.167
  31,     0,    2,         0,          1,      1.167
  32,     0,   31,        23,          1,      0.802
  32,     0,   31,         0,          1,      0.802
  32,    31,   31,        23,          1,      0.798
  32,    31,   31,         0,          1,      0.797
  30,     0,   31,        23,          1,       0.88
  30,     0,   31,         0,          1,      0.888
  30,    31,   31,        23,          1,       0.96
  30,    31,   31,         0,          1,      0.869
  32,  2048,   31,        23,          1,      0.802
  32,  2048,   31,         0,          1,      0.802
  32,  2079,   31,        23,          1,      0.843
  32,  2079,   31,         0,          1,      0.835
  30,  2048,   31,        23,          1,      0.889
  30,  2048,   31,         0,          1,      0.889
  30,  2079,   31,        23,          1,      0.937
  30,  2079,   31,         0,          1,      0.872
  30,  4081,   31,        23,          1,       1.11
  30,  4081,   31,         0,          1,      1.142
  32,  4081,   31,        23,          1,      0.864
  32,  4081,   31,         0,          1,      0.872
  32,     0,    1,        23,          1,      1.167
  32,     0,    2,         0,          1,      1.167

>  sysdeps/x86_64/memrchr.S | 613 +++++++++++++++++++--------------------
>  1 file changed, 292 insertions(+), 321 deletions(-)
>
> diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
> index d1a9f47911..b0dffd2ae2 100644
> --- a/sysdeps/x86_64/memrchr.S
> +++ b/sysdeps/x86_64/memrchr.S
> @@ -18,362 +18,333 @@
>     <https://www.gnu.org/licenses/>.  */
>
>  #include <sysdep.h>
> +#define VEC_SIZE                       16
> +#define PAGE_SIZE                      4096
>
>         .text
> -ENTRY (__memrchr)
> -       movd    %esi, %xmm1
> -
> -       sub     $16, %RDX_LP
> -       jbe     L(length_less16)
> -
> -       punpcklbw       %xmm1, %xmm1
> -       punpcklbw       %xmm1, %xmm1
> -
> -       add     %RDX_LP, %RDI_LP
> -       pshufd  $0, %xmm1, %xmm1
> -
> -       movdqu  (%rdi), %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -
> -/* Check if there is a match.  */
> -       pmovmskb        %xmm0, %eax
> -       test    %eax, %eax
> -       jnz     L(matches0)
> -
> -       sub     $64, %rdi
> -       mov     %edi, %ecx
> -       and     $15, %ecx
> -       jz      L(loop_prolog)
> -
> -       add     $16, %rdi
> -       add     $16, %rdx
> -       and     $-16, %rdi
> -       sub     %rcx, %rdx
> -
> -       .p2align 4
> -L(loop_prolog):
> -       sub     $64, %rdx
> -       jbe     L(exit_loop)
> -
> -       movdqa  48(%rdi), %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       pmovmskb        %xmm0, %eax
> -       test    %eax, %eax
> -       jnz     L(matches48)
> -
> -       movdqa  32(%rdi), %xmm2
> -       pcmpeqb %xmm1, %xmm2
> -       pmovmskb        %xmm2, %eax
> -       test    %eax, %eax
> -       jnz     L(matches32)
> -
> -       movdqa  16(%rdi), %xmm3
> -       pcmpeqb %xmm1, %xmm3
> -       pmovmskb        %xmm3, %eax
> -       test    %eax, %eax
> -       jnz     L(matches16)
> -
> -       movdqa  (%rdi), %xmm4
> -       pcmpeqb %xmm1, %xmm4
> -       pmovmskb        %xmm4, %eax
> -       test    %eax, %eax
> -       jnz     L(matches0)
> -
> -       sub     $64, %rdi
> -       sub     $64, %rdx
> -       jbe     L(exit_loop)
> -
> -       movdqa  48(%rdi), %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       pmovmskb        %xmm0, %eax
> -       test    %eax, %eax
> -       jnz     L(matches48)
> -
> -       movdqa  32(%rdi), %xmm2
> -       pcmpeqb %xmm1, %xmm2
> -       pmovmskb        %xmm2, %eax
> -       test    %eax, %eax
> -       jnz     L(matches32)
> -
> -       movdqa  16(%rdi), %xmm3
> -       pcmpeqb %xmm1, %xmm3
> -       pmovmskb        %xmm3, %eax
> -       test    %eax, %eax
> -       jnz     L(matches16)
> -
> -       movdqa  (%rdi), %xmm3
> -       pcmpeqb %xmm1, %xmm3
> -       pmovmskb        %xmm3, %eax
> -       test    %eax, %eax
> -       jnz     L(matches0)
> -
> -       mov     %edi, %ecx
> -       and     $63, %ecx
> -       jz      L(align64_loop)
> -
> -       add     $64, %rdi
> -       add     $64, %rdx
> -       and     $-64, %rdi
> -       sub     %rcx, %rdx
> -
> -       .p2align 4
> -L(align64_loop):
> -       sub     $64, %rdi
> -       sub     $64, %rdx
> -       jbe     L(exit_loop)
> -
> -       movdqa  (%rdi), %xmm0
> -       movdqa  16(%rdi), %xmm2
> -       movdqa  32(%rdi), %xmm3
> -       movdqa  48(%rdi), %xmm4
> -
> -       pcmpeqb %xmm1, %xmm0
> -       pcmpeqb %xmm1, %xmm2
> -       pcmpeqb %xmm1, %xmm3
> -       pcmpeqb %xmm1, %xmm4
> -
> -       pmaxub  %xmm3, %xmm0
> -       pmaxub  %xmm4, %xmm2
> -       pmaxub  %xmm0, %xmm2
> -       pmovmskb        %xmm2, %eax
> -
> -       test    %eax, %eax
> -       jz      L(align64_loop)
> -
> -       pmovmskb        %xmm4, %eax
> -       test    %eax, %eax
> -       jnz     L(matches48)
> -
> -       pmovmskb        %xmm3, %eax
> -       test    %eax, %eax
> -       jnz     L(matches32)
> -
> -       movdqa  16(%rdi), %xmm2
> -
> -       pcmpeqb %xmm1, %xmm2
> -       pcmpeqb (%rdi), %xmm1
> -
> -       pmovmskb        %xmm2, %eax
> -       test    %eax, %eax
> -       jnz     L(matches16)
> -
> -       pmovmskb        %xmm1, %eax
> -       bsr     %eax, %eax
> -
> -       add     %rdi, %rax
> +ENTRY_P2ALIGN(__memrchr, 6)
> +#ifdef __ILP32__
> +       /* Clear upper bits.  */
> +       mov     %RDX_LP, %RDX_LP
> +#endif
> +       movd    %esi, %xmm0
> +
> +       /* Get end pointer.  */
> +       leaq    (%rdx, %rdi), %rcx
> +
> +       punpcklbw %xmm0, %xmm0
> +       punpcklwd %xmm0, %xmm0
> +       pshufd  $0, %xmm0, %xmm0
> +
> +       /* Check if we can load 1x VEC without crossing a page.  */
> +       testl   $(PAGE_SIZE - VEC_SIZE), %ecx
> +       jz      L(page_cross)
> +
> +       /* NB: This load happens regardless of whether rdx (len) is zero. Since
> +          it doesn't cross a page and the standard guarantees any pointer has
> +          at least one valid byte this load must be safe. For the entire
> +          history of the x86 memrchr implementation this has been possible so
> +          no code "should" be relying on a zero-length check before this load.
> +          The zero-length check is moved to the page cross case because it is
> +          1) pretty cold and 2) including it pushes the hot case len <= VEC_SIZE
> +          into 2-cache lines.  */
> +       movups  -(VEC_SIZE)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       subq    $VEC_SIZE, %rdx
> +       ja      L(more_1x_vec)
> +L(ret_vec_x0_test):
> +       /* Zero-flag set if eax (src) is zero. Destination unchanged if src is
> +          zero.  */
> +       bsrl    %eax, %eax
> +       jz      L(ret_0)
> +       /* Check if the CHAR match is in bounds. Need to truly zero `eax` here
> +          if out of bounds.  */
> +       addl    %edx, %eax
> +       jl      L(zero_0)
> +       /* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
> +          ptr.  */
> +       addq    %rdi, %rax
> +L(ret_0):
>         ret
>
> -       .p2align 4
> -L(exit_loop):
> -       add     $64, %edx
> -       cmp     $32, %edx
> -       jbe     L(exit_loop_32)
> -
> -       movdqa  48(%rdi), %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       pmovmskb        %xmm0, %eax
> -       test    %eax, %eax
> -       jnz     L(matches48)
> -
> -       movdqa  32(%rdi), %xmm2
> -       pcmpeqb %xmm1, %xmm2
> -       pmovmskb        %xmm2, %eax
> -       test    %eax, %eax
> -       jnz     L(matches32)
> -
> -       movdqa  16(%rdi), %xmm3
> -       pcmpeqb %xmm1, %xmm3
> -       pmovmskb        %xmm3, %eax
> -       test    %eax, %eax
> -       jnz     L(matches16_1)
> -       cmp     $48, %edx
> -       jbe     L(return_null)
> -
> -       pcmpeqb (%rdi), %xmm1
> -       pmovmskb        %xmm1, %eax
> -       test    %eax, %eax
> -       jnz     L(matches0_1)
> -       xor     %eax, %eax
> +       .p2align 4,, 5
> +L(ret_vec_x0):
> +       bsrl    %eax, %eax
> +       leaq    -(VEC_SIZE)(%rcx, %rax), %rax
>         ret
>
> -       .p2align 4
> -L(exit_loop_32):
> -       movdqa  48(%rdi), %xmm0
> -       pcmpeqb %xmm1, %xmm0
> -       pmovmskb        %xmm0, %eax
> -       test    %eax, %eax
> -       jnz     L(matches48_1)
> -       cmp     $16, %edx
> -       jbe     L(return_null)
> -
> -       pcmpeqb 32(%rdi), %xmm1
> -       pmovmskb        %xmm1, %eax
> -       test    %eax, %eax
> -       jnz     L(matches32_1)
> -       xor     %eax, %eax
> +       .p2align 4,, 2
> +L(zero_0):
> +       xorl    %eax, %eax
>         ret
>
> -       .p2align 4
> -L(matches0):
> -       bsr     %eax, %eax
> -       add     %rdi, %rax
> -       ret
> -
> -       .p2align 4
> -L(matches16):
> -       bsr     %eax, %eax
> -       lea     16(%rax, %rdi), %rax
> -       ret
>
> -       .p2align 4
> -L(matches32):
> -       bsr     %eax, %eax
> -       lea     32(%rax, %rdi), %rax
> +       .p2align 4,, 8
> +L(more_1x_vec):
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x0)
> +
> +       /* Align rcx (pointer to string).  */
> +       decq    %rcx
> +       andq    $-VEC_SIZE, %rcx
> +
> +       movq    %rcx, %rdx
> +       /* NB: We could consistently save 1-byte in this pattern with `movaps
> +          %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
> +          it adds more frontend uops (even if the moves can be eliminated) and
> +          some percentage of the time actual backend uops.  */
> +       movaps  -(VEC_SIZE)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       subq    %rdi, %rdx
> +       pmovmskb %xmm1, %eax
> +
> +       cmpq    $(VEC_SIZE * 2), %rdx
> +       ja      L(more_2x_vec)
> +L(last_2x_vec):
> +       subl    $VEC_SIZE, %edx
> +       jbe     L(ret_vec_x0_test)
> +
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x0)
> +
> +       movaps  -(VEC_SIZE * 2)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       subl    $VEC_SIZE, %edx
> +       bsrl    %eax, %eax
> +       jz      L(ret_1)
> +       addl    %edx, %eax
> +       jl      L(zero_0)
> +       addq    %rdi, %rax
> +L(ret_1):
>         ret
>
> -       .p2align 4
> -L(matches48):
> -       bsr     %eax, %eax
> -       lea     48(%rax, %rdi), %rax
> +       /* Don't align. Otherwise losing the 2-byte encoding in the jump to
> +          L(page_cross) causes the hot path (length <= VEC_SIZE) to span
> +          multiple cache lines.  Naturally aligned % 16 to 8-bytes.  */
> +L(page_cross):
> +       /* Zero length check.  */
> +       testq   %rdx, %rdx
> +       jz      L(zero_0)
> +
> +       leaq    -1(%rcx), %r8
> +       andq    $-(VEC_SIZE), %r8
> +
> +       movaps  (%r8), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %esi
> +       /* Shift out negative alignment (because we are starting from endptr and
> +          working backwards).  */
> +       negl    %ecx
> +       /* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
> +          explicitly.  */
> +       andl    $(VEC_SIZE - 1), %ecx
> +       shl     %cl, %esi
> +       movzwl  %si, %eax
> +       leaq    (%rdi, %rdx), %rcx
> +       cmpq    %rdi, %r8
> +       ja      L(more_1x_vec)
> +       subl    $VEC_SIZE, %edx
> +       bsrl    %eax, %eax
> +       jz      L(ret_2)
> +       addl    %edx, %eax
> +       jl      L(zero_1)
> +       addq    %rdi, %rax
> +L(ret_2):
>         ret
>
> -       .p2align 4
> -L(matches0_1):
> -       bsr     %eax, %eax
> -       sub     $64, %rdx
> -       add     %rax, %rdx
> -       jl      L(return_null)
> -       add     %rdi, %rax
> +       /* Fits in aligning bytes.  */
> +L(zero_1):
> +       xorl    %eax, %eax
>         ret
>
> -       .p2align 4
> -L(matches16_1):
> -       bsr     %eax, %eax
> -       sub     $48, %rdx
> -       add     %rax, %rdx
> -       jl      L(return_null)
> -       lea     16(%rdi, %rax), %rax
> +       .p2align 4,, 5
> +L(ret_vec_x1):
> +       bsrl    %eax, %eax
> +       leaq    -(VEC_SIZE * 2)(%rcx, %rax), %rax
>         ret
>
> -       .p2align 4
> -L(matches32_1):
> -       bsr     %eax, %eax
> -       sub     $32, %rdx
> -       add     %rax, %rdx
> -       jl      L(return_null)
> -       lea     32(%rdi, %rax), %rax
> -       ret
> +       .p2align 4,, 8
> +L(more_2x_vec):
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x0)
>
> -       .p2align 4
> -L(matches48_1):
> -       bsr     %eax, %eax
> -       sub     $16, %rdx
> -       add     %rax, %rdx
> -       jl      L(return_null)
> -       lea     48(%rdi, %rax), %rax
> -       ret
> +       movaps  -(VEC_SIZE * 2)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x1)
>
> -       .p2align 4
> -L(return_null):
> -       xor     %eax, %eax
> -       ret
>
> -       .p2align 4
> -L(length_less16_offset0):
> -       test    %edx, %edx
> -       jz      L(return_null)
> +       movaps  -(VEC_SIZE * 3)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
>
> -       mov     %dl, %cl
> -       pcmpeqb (%rdi), %xmm1
> +       subq    $(VEC_SIZE * 4), %rdx
> +       ja      L(more_4x_vec)
>
> -       mov     $1, %edx
> -       sal     %cl, %edx
> -       sub     $1, %edx
> +       addl    $(VEC_SIZE), %edx
> +       jle     L(ret_vec_x2_test)
>
> -       pmovmskb        %xmm1, %eax
> +L(last_vec):
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x2)
>
> -       and     %edx, %eax
> -       test    %eax, %eax
> -       jz      L(return_null)
> +       movaps  -(VEC_SIZE * 4)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
>
> -       bsr     %eax, %eax
> -       add     %rdi, %rax
> +       subl    $(VEC_SIZE), %edx
> +       bsrl    %eax, %eax
> +       jz      L(ret_3)
> +       addl    %edx, %eax
> +       jl      L(zero_2)
> +       addq    %rdi, %rax
> +L(ret_3):
>         ret
>
> -       .p2align 4
> -L(length_less16):
> -       punpcklbw       %xmm1, %xmm1
> -       punpcklbw       %xmm1, %xmm1
> -
> -       add     $16, %edx
> -
> -       pshufd  $0, %xmm1, %xmm1
> -
> -       mov     %edi, %ecx
> -       and     $15, %ecx
> -       jz      L(length_less16_offset0)
> -
> -       mov     %cl, %dh
> -       mov     %ecx, %esi
> -       add     %dl, %dh
> -       and     $-16, %rdi
> -
> -       sub     $16, %dh
> -       ja      L(length_less16_part2)
> -
> -       pcmpeqb (%rdi), %xmm1
> -       pmovmskb        %xmm1, %eax
> -
> -       sar     %cl, %eax
> -       mov     %dl, %cl
> -
> -       mov     $1, %edx
> -       sal     %cl, %edx
> -       sub     $1, %edx
> -
> -       and     %edx, %eax
> -       test    %eax, %eax
> -       jz      L(return_null)
> -
> -       bsr     %eax, %eax
> -       add     %rdi, %rax
> -       add     %rsi, %rax
> +       .p2align 4,, 6
> +L(ret_vec_x2_test):
> +       bsrl    %eax, %eax
> +       jz      L(zero_2)
> +       addl    %edx, %eax
> +       jl      L(zero_2)
> +       addq    %rdi, %rax
>         ret
>
> -       .p2align 4
> -L(length_less16_part2):
> -       movdqa  16(%rdi), %xmm2
> -       pcmpeqb %xmm1, %xmm2
> -       pmovmskb        %xmm2, %eax
> -
> -       mov     %dh, %cl
> -       mov     $1, %edx
> -       sal     %cl, %edx
> -       sub     $1, %edx
> -
> -       and     %edx, %eax
> +L(zero_2):
> +       xorl    %eax, %eax
> +       ret
>
> -       test    %eax, %eax
> -       jnz     L(length_less16_part2_return)
>
> -       pcmpeqb (%rdi), %xmm1
> -       pmovmskb        %xmm1, %eax
> +       .p2align 4,, 5
> +L(ret_vec_x2):
> +       bsrl    %eax, %eax
> +       leaq    -(VEC_SIZE * 3)(%rcx, %rax), %rax
> +       ret
>
> -       mov     %esi, %ecx
> -       sar     %cl, %eax
> -       test    %eax, %eax
> -       jz      L(return_null)
> +       .p2align 4,, 5
> +L(ret_vec_x3):
> +       bsrl    %eax, %eax
> +       leaq    -(VEC_SIZE * 4)(%rcx, %rax), %rax
> +       ret
>
> -       bsr     %eax, %eax
> -       add     %rdi, %rax
> -       add     %rsi, %rax
> +       .p2align 4,, 8
> +L(more_4x_vec):
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x2)
> +
> +       movaps  -(VEC_SIZE * 4)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x3)
> +
> +       addq    $-(VEC_SIZE * 4), %rcx
> +       cmpq    $(VEC_SIZE * 4), %rdx
> +       jbe     L(last_4x_vec)
> +
> +       /* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
> +          keeping the code from spilling to the next cache line.  */
> +       addq    $(VEC_SIZE * 4 - 1), %rcx
> +       andq    $-(VEC_SIZE * 4), %rcx
> +       leaq    (VEC_SIZE * 4)(%rdi), %rdx
> +       andq    $-(VEC_SIZE * 4), %rdx
> +
> +       .p2align 4,, 11
> +L(loop_4x_vec):
> +       movaps  (VEC_SIZE * -1)(%rcx), %xmm1
> +       movaps  (VEC_SIZE * -2)(%rcx), %xmm2
> +       movaps  (VEC_SIZE * -3)(%rcx), %xmm3
> +       movaps  (VEC_SIZE * -4)(%rcx), %xmm4
> +       pcmpeqb %xmm0, %xmm1
> +       pcmpeqb %xmm0, %xmm2
> +       pcmpeqb %xmm0, %xmm3
> +       pcmpeqb %xmm0, %xmm4
> +
> +       por     %xmm1, %xmm2
> +       por     %xmm3, %xmm4
> +       por     %xmm2, %xmm4
> +
> +       pmovmskb %xmm4, %esi
> +       testl   %esi, %esi
> +       jnz     L(loop_end)
> +
> +       addq    $-(VEC_SIZE * 4), %rcx
> +       cmpq    %rdx, %rcx
> +       jne     L(loop_4x_vec)
> +
> +       subl    %edi, %edx
> +
> +       /* Ends up being 1-byte nop.  */
> +       .p2align 4,, 2
> +L(last_4x_vec):
> +       movaps  -(VEC_SIZE)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       cmpl    $(VEC_SIZE * 2), %edx
> +       jbe     L(last_2x_vec)
> +
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_x0)
> +
> +
> +       movaps  -(VEC_SIZE * 2)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_end)
> +
> +       movaps  -(VEC_SIZE * 3)(%rcx), %xmm1
> +       pcmpeqb %xmm0, %xmm1
> +       pmovmskb %xmm1, %eax
> +
> +       subl    $(VEC_SIZE * 3), %edx
> +       ja      L(last_vec)
> +       bsrl    %eax, %eax
> +       jz      L(ret_4)
> +       addl    %edx, %eax
> +       jl      L(zero_3)
> +       addq    %rdi, %rax
> +L(ret_4):
>         ret
>
> -       .p2align 4
> -L(length_less16_part2_return):
> -       bsr     %eax, %eax
> -       lea     16(%rax, %rdi), %rax
> +       /* Ends up being 1-byte nop.  */
> +       .p2align 4,, 3
> +L(loop_end):
> +       pmovmskb %xmm1, %eax
> +       sall    $16, %eax
> +       jnz     L(ret_vec_end)
> +
> +       pmovmskb %xmm2, %eax
> +       testl   %eax, %eax
> +       jnz     L(ret_vec_end)
> +
> +       pmovmskb %xmm3, %eax
> +       /* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3)
> +          then it won't affect the result in esi (VEC4). If eax is non-zero
> +          then CHAR is in VEC3 and bsrl will use that position.  */
> +       sall    $16, %eax
> +       orl     %esi, %eax
> +       bsrl    %eax, %eax
> +       leaq    -(VEC_SIZE * 4)(%rcx, %rax), %rax
>         ret
>
> -END (__memrchr)
> +L(ret_vec_end):
> +       bsrl    %eax, %eax
> +       leaq    (VEC_SIZE * -2)(%rax, %rcx), %rax
> +       ret
> +       /* Used in L(last_4x_vec). In the same cache line. This just uses spare
> +          aligning bytes.  */
> +L(zero_3):
> +       xorl    %eax, %eax
> +       ret
> +       /* 2-bytes from next cache line.  */
> +END(__memrchr)
>  weak_alias (__memrchr, memrchr)
> --
> 2.34.1
>
diff mbox series

Patch

diff --git a/sysdeps/x86_64/memrchr.S b/sysdeps/x86_64/memrchr.S
index d1a9f47911..b0dffd2ae2 100644
--- a/sysdeps/x86_64/memrchr.S
+++ b/sysdeps/x86_64/memrchr.S
@@ -18,362 +18,333 @@ 
    <https://www.gnu.org/licenses/>.  */
 
 #include <sysdep.h>
+#define VEC_SIZE			16
+#define PAGE_SIZE			4096
 
 	.text
-ENTRY (__memrchr)
-	movd	%esi, %xmm1
-
-	sub	$16, %RDX_LP
-	jbe	L(length_less16)
-
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	%RDX_LP, %RDI_LP
-	pshufd	$0, %xmm1, %xmm1
-
-	movdqu	(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-
-/* Check if there is a match.  */
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(loop_prolog)
-
-	add	$16, %rdi
-	add	$16, %rdx
-	and	$-16, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(loop_prolog):
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm4
-	pcmpeqb	%xmm1, %xmm4
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	movdqa	(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches0)
-
-	mov	%edi, %ecx
-	and	$63, %ecx
-	jz	L(align64_loop)
-
-	add	$64, %rdi
-	add	$64, %rdx
-	and	$-64, %rdi
-	sub	%rcx, %rdx
-
-	.p2align 4
-L(align64_loop):
-	sub	$64, %rdi
-	sub	$64, %rdx
-	jbe	L(exit_loop)
-
-	movdqa	(%rdi), %xmm0
-	movdqa	16(%rdi), %xmm2
-	movdqa	32(%rdi), %xmm3
-	movdqa	48(%rdi), %xmm4
-
-	pcmpeqb	%xmm1, %xmm0
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	%xmm1, %xmm3
-	pcmpeqb	%xmm1, %xmm4
-
-	pmaxub	%xmm3, %xmm0
-	pmaxub	%xmm4, %xmm2
-	pmaxub	%xmm0, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	test	%eax, %eax
-	jz	L(align64_loop)
-
-	pmovmskb	%xmm4, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm2
-
-	pcmpeqb	%xmm1, %xmm2
-	pcmpeqb	(%rdi), %xmm1
-
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches16)
-
-	pmovmskb	%xmm1, %eax
-	bsr	%eax, %eax
-
-	add	%rdi, %rax
+ENTRY_P2ALIGN(__memrchr, 6)
+#ifdef __ILP32__
+	/* Clear upper bits.  */
+	mov	%RDX_LP, %RDX_LP
+#endif
+	movd	%esi, %xmm0
+
+	/* Get end pointer.  */
+	leaq	(%rdx, %rdi), %rcx
+
+	punpcklbw %xmm0, %xmm0
+	punpcklwd %xmm0, %xmm0
+	pshufd	$0, %xmm0, %xmm0
+
+	/* Check if we can load 1x VEC without crossing a page.  */
+	testl	$(PAGE_SIZE - VEC_SIZE), %ecx
+	jz	L(page_cross)
+
+	/* NB: This load happens regardless of whether rdx (len) is zero. Since
+	   it doesn't cross a page and the standard guarantees any pointer has
+	   at least one valid byte this load must be safe. For the entire
+	   history of the x86 memrchr implementation this has been possible so
+	   no code "should" be relying on a zero-length check before this load.
+	   The zero-length check is moved to the page cross case because it is
+	   1) pretty cold and 2) including it pushes the hot case len <=
+	   VEC_SIZE into 2 cache lines.  */
+	movups	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subq	$VEC_SIZE, %rdx
+	ja	L(more_1x_vec)
+L(ret_vec_x0_test):
+	/* Zero-flag set if eax (src) is zero. Destination unchanged if src is
+	   zero.  */
+	bsrl	%eax, %eax
+	jz	L(ret_0)
+	/* Check if the CHAR match is in bounds. Need to truly zero `eax` here
+	   if out of bounds.  */
+	addl	%edx, %eax
+	jl	L(zero_0)
+	/* Since we subtracted VEC_SIZE from rdx earlier we can just add to base
+	   ptr.  */
+	addq	%rdi, %rax
+L(ret_0):
 	ret
 
-	.p2align 4
-L(exit_loop):
-	add	$64, %edx
-	cmp	$32, %edx
-	jbe	L(exit_loop_32)
-
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48)
-
-	movdqa	32(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-	test	%eax, %eax
-	jnz	L(matches32)
-
-	movdqa	16(%rdi), %xmm3
-	pcmpeqb	%xmm1, %xmm3
-	pmovmskb	%xmm3, %eax
-	test	%eax, %eax
-	jnz	L(matches16_1)
-	cmp	$48, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches0_1)
-	xor	%eax, %eax
+	.p2align 4,, 5
+L(ret_vec_x0):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(exit_loop_32):
-	movdqa	48(%rdi), %xmm0
-	pcmpeqb	%xmm1, %xmm0
-	pmovmskb	%xmm0, %eax
-	test	%eax, %eax
-	jnz	L(matches48_1)
-	cmp	$16, %edx
-	jbe	L(return_null)
-
-	pcmpeqb	32(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-	test	%eax, %eax
-	jnz	L(matches32_1)
-	xor	%eax, %eax
+	.p2align 4,, 2
+L(zero_0):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches0):
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	ret
-
-	.p2align 4
-L(matches16):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
-	ret
 
-	.p2align 4
-L(matches32):
-	bsr	%eax, %eax
-	lea	32(%rax, %rdi), %rax
+	.p2align 4,, 8
+L(more_1x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	/* Align rcx (pointer to string).  */
+	decq	%rcx
+	andq	$-VEC_SIZE, %rcx
+
+	movq	%rcx, %rdx
+	/* NB: We could consistently save 1-byte in this pattern with `movaps
+	   %xmm0, %xmm1; pcmpeq IMM8(r), %xmm1; ...`. The reason against it is
+	   it adds more frontend uops (even if the moves can be eliminated) and
+	   some percentage of the time actual backend uops.  */
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	subq	%rdi, %rdx
+	pmovmskb %xmm1, %eax
+
+	cmpq	$(VEC_SIZE * 2), %rdx
+	ja	L(more_2x_vec)
+L(last_2x_vec):
+	subl	$VEC_SIZE, %edx
+	jbe	L(ret_vec_x0_test)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_1)
+	addl	%edx, %eax
+	jl	L(zero_0)
+	addq	%rdi, %rax
+L(ret_1):
 	ret
 
-	.p2align 4
-L(matches48):
-	bsr	%eax, %eax
-	lea	48(%rax, %rdi), %rax
+	/* Don't align. Otherwise losing the 2-byte encoding of the jump to
+	   L(page_cross) causes the hot path (length <= VEC_SIZE) to span
+	   multiple cache lines.  Naturally aligned % 16 to 8-bytes.  */
+L(page_cross):
+	/* Zero length check.  */
+	testq	%rdx, %rdx
+	jz	L(zero_0)
+
+	leaq	-1(%rcx), %r8
+	andq	$-(VEC_SIZE), %r8
+
+	movaps	(%r8), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %esi
+	/* Shift out negative alignment (because we are starting from endptr and
+	   working backwards).  */
+	negl	%ecx
+	/* 32-bit shift but VEC_SIZE=16 so need to mask the shift count
+	   explicitly.  */
+	andl	$(VEC_SIZE - 1), %ecx
+	shl	%cl, %esi
+	movzwl	%si, %eax
+	leaq	(%rdi, %rdx), %rcx
+	cmpq	%rdi, %r8
+	ja	L(more_1x_vec)
+	subl	$VEC_SIZE, %edx
+	bsrl	%eax, %eax
+	jz	L(ret_2)
+	addl	%edx, %eax
+	jl	L(zero_1)
+	addq	%rdi, %rax
+L(ret_2):
 	ret
 
-	.p2align 4
-L(matches0_1):
-	bsr	%eax, %eax
-	sub	$64, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	add	%rdi, %rax
+	/* Fits in aligning bytes.  */
+L(zero_1):
+	xorl	%eax, %eax
 	ret
 
-	.p2align 4
-L(matches16_1):
-	bsr	%eax, %eax
-	sub	$48, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	16(%rdi, %rax), %rax
+	.p2align 4,, 5
+L(ret_vec_x1):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 2)(%rcx, %rax), %rax
 	ret
 
-	.p2align 4
-L(matches32_1):
-	bsr	%eax, %eax
-	sub	$32, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	32(%rdi, %rax), %rax
-	ret
+	.p2align 4,, 8
+L(more_2x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
 
-	.p2align 4
-L(matches48_1):
-	bsr	%eax, %eax
-	sub	$16, %rdx
-	add	%rax, %rdx
-	jl	L(return_null)
-	lea	48(%rdi, %rax), %rax
-	ret
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_x1)
 
-	.p2align 4
-L(return_null):
-	xor	%eax, %eax
-	ret
 
-	.p2align 4
-L(length_less16_offset0):
-	test	%edx, %edx
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	mov	%dl, %cl
-	pcmpeqb	(%rdi), %xmm1
+	subq	$(VEC_SIZE * 4), %rdx
+	ja	L(more_4x_vec)
 
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
+	addl	$(VEC_SIZE), %edx
+	jle	L(ret_vec_x2_test)
 
-	pmovmskb	%xmm1, %eax
+L(last_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
 
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
+	subl	$(VEC_SIZE), %edx
+	bsrl	%eax, %eax
+	jz	L(ret_3)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
+L(ret_3):
 	ret
 
-	.p2align 4
-L(length_less16):
-	punpcklbw	%xmm1, %xmm1
-	punpcklbw	%xmm1, %xmm1
-
-	add	$16, %edx
-
-	pshufd	$0, %xmm1, %xmm1
-
-	mov	%edi, %ecx
-	and	$15, %ecx
-	jz	L(length_less16_offset0)
-
-	mov	%cl, %dh
-	mov	%ecx, %esi
-	add	%dl, %dh
-	and	$-16, %rdi
-
-	sub	$16, %dh
-	ja	L(length_less16_part2)
-
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
-
-	sar	%cl, %eax
-	mov	%dl, %cl
-
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
-	test	%eax, %eax
-	jz	L(return_null)
-
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 6
+L(ret_vec_x2_test):
+	bsrl	%eax, %eax
+	jz	L(zero_2)
+	addl	%edx, %eax
+	jl	L(zero_2)
+	addq	%rdi, %rax
 	ret
 
-	.p2align 4
-L(length_less16_part2):
-	movdqa	16(%rdi), %xmm2
-	pcmpeqb	%xmm1, %xmm2
-	pmovmskb	%xmm2, %eax
-
-	mov	%dh, %cl
-	mov	$1, %edx
-	sal	%cl, %edx
-	sub	$1, %edx
-
-	and	%edx, %eax
+L(zero_2):
+	xorl	%eax, %eax
+	ret
 
-	test	%eax, %eax
-	jnz	L(length_less16_part2_return)
 
-	pcmpeqb	(%rdi), %xmm1
-	pmovmskb	%xmm1, %eax
+	.p2align 4,, 5
+L(ret_vec_x2):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 3)(%rcx, %rax), %rax
+	ret
 
-	mov	%esi, %ecx
-	sar	%cl, %eax
-	test	%eax, %eax
-	jz	L(return_null)
+	.p2align 4,, 5
+L(ret_vec_x3):
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
+	ret
 
-	bsr	%eax, %eax
-	add	%rdi, %rax
-	add	%rsi, %rax
+	.p2align 4,, 8
+L(more_4x_vec):
+	testl	%eax, %eax
+	jnz	L(ret_vec_x2)
+
+	movaps	-(VEC_SIZE * 4)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x3)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	$(VEC_SIZE * 4), %rdx
+	jbe	L(last_4x_vec)
+
+	/* Offset everything by 4x VEC_SIZE here to save a few bytes at the end
+	   keeping the code from spilling to the next cache line.  */
+	addq	$(VEC_SIZE * 4 - 1), %rcx
+	andq	$-(VEC_SIZE * 4), %rcx
+	leaq	(VEC_SIZE * 4)(%rdi), %rdx
+	andq	$-(VEC_SIZE * 4), %rdx
+
+	.p2align 4,, 11
+L(loop_4x_vec):
+	movaps	(VEC_SIZE * -1)(%rcx), %xmm1
+	movaps	(VEC_SIZE * -2)(%rcx), %xmm2
+	movaps	(VEC_SIZE * -3)(%rcx), %xmm3
+	movaps	(VEC_SIZE * -4)(%rcx), %xmm4
+	pcmpeqb	%xmm0, %xmm1
+	pcmpeqb	%xmm0, %xmm2
+	pcmpeqb	%xmm0, %xmm3
+	pcmpeqb	%xmm0, %xmm4
+
+	por	%xmm1, %xmm2
+	por	%xmm3, %xmm4
+	por	%xmm2, %xmm4
+
+	pmovmskb %xmm4, %esi
+	testl	%esi, %esi
+	jnz	L(loop_end)
+
+	addq	$-(VEC_SIZE * 4), %rcx
+	cmpq	%rdx, %rcx
+	jne	L(loop_4x_vec)
+
+	subl	%edi, %edx
+
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 2
+L(last_4x_vec):
+	movaps	-(VEC_SIZE)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	cmpl	$(VEC_SIZE * 2), %edx
+	jbe	L(last_2x_vec)
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_x0)
+
+
+	movaps	-(VEC_SIZE * 2)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	movaps	-(VEC_SIZE * 3)(%rcx), %xmm1
+	pcmpeqb	%xmm0, %xmm1
+	pmovmskb %xmm1, %eax
+
+	subl	$(VEC_SIZE * 3), %edx
+	ja	L(last_vec)
+	bsrl	%eax, %eax
+	jz	L(ret_4)
+	addl	%edx, %eax
+	jl	L(zero_3)
+	addq	%rdi, %rax
+L(ret_4):
 	ret
 
-	.p2align 4
-L(length_less16_part2_return):
-	bsr	%eax, %eax
-	lea	16(%rax, %rdi), %rax
+	/* Ends up being 1-byte nop.  */
+	.p2align 4,, 3
+L(loop_end):
+	pmovmskb %xmm1, %eax
+	sall	$16, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm2, %eax
+	testl	%eax, %eax
+	jnz	L(ret_vec_end)
+
+	pmovmskb %xmm3, %eax
+	/* Combine last 2 VEC matches. If eax (VEC3) is zero (no CHAR in VEC3)
+	   then it won't affect the result in esi (VEC4). If eax is non-zero
+	   then CHAR is in VEC3 and bsrl will use that position.  */
+	sall	$16, %eax
+	orl	%esi, %eax
+	bsrl	%eax, %eax
+	leaq	-(VEC_SIZE * 4)(%rcx, %rax), %rax
 	ret
 
-END (__memrchr)
+L(ret_vec_end):
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE * -2)(%rax, %rcx), %rax
+	ret
+	/* Used in L(last_4x_vec). In the same cache line. This just uses spare
+	   aligning bytes.  */
+L(zero_3):
+	xorl	%eax, %eax
+	ret
+	/* 2-bytes from next cache line.  */
+END(__memrchr)
 weak_alias (__memrchr, memrchr)