
[v1,4/5] x86: Optimize {str|wcs}rchr-avx2

Message ID 20220421031410.2142238-4-goldstein.w.n@gmail.com
State New
Series [v1,1/5] benchtests: Improve bench-strrchr

Commit Message

Noah Goldstein April 21, 2022, 3:14 a.m. UTC
The new code unrolls the main loop slightly without adding too much
overhead and minimizes the number of comparisons against the search CHAR.
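
In rough, portable C, the strategy looks something like the sketch
below. This is an illustration only, not glibc code: block_mask () is a
scalar stand-in for VPCMPEQ + vpmovmskb, the real code is vectorized,
2x unrolled, and careful about page crossings, and the m ^ (m - 1)
step mirrors the blsmskl/blsmskq used in the patch to discard matches
past the first null terminator.

/* Illustrative sketch of the strategy (assumes the buffer stays
   readable for a full block past the terminator; the real code aligns
   its loads so that is always safe).  */
#include <stdint.h>
#include <stdio.h>

#define BLOCK 32

/* Bit i is set iff block[i] == c (stand-in for VPCMPEQ/vpmovmskb).  */
static uint32_t
block_mask (const char *block, char c)
{
  uint32_t m = 0;
  for (int i = 0; i < BLOCK; i++)
    m |= (uint32_t) (block[i] == c) << i;
  return m;
}

static char *
strrchr_sketch (const char *s, int c)
{
  const char *last = NULL;
  for (;; s += BLOCK)
    {
      uint32_t zmask = block_mask (s, '\0');
      if (zmask == 0)
        {
          /* No terminator in this block: just remember the furthest
             match seen so far.  */
          uint32_t cmask = block_mask (s, (char) c);
          if (cmask != 0)
            last = s + (31 - __builtin_clz (cmask));  /* ~bsrl  */
          continue;
        }
      /* Terminator is in this block: keep only matches at or before
         the first null (the blsmsk trick), else fall back to the
         remembered match.  */
      uint32_t cmask = block_mask (s, (char) c);
      cmask &= zmask ^ (zmask - 1);
      if (cmask != 0)
        return (char *) (s + (31 - __builtin_clz (cmask)));
      return (char *) last;
    }
}

int
main (void)
{
  char buf[64] = "hello world, hello asm";
  printf ("%s\n", strrchr_sketch (buf, 'h'));  /* prints "hello asm"  */
  return 0;
}

The real first-vector path is lazier still: it does not compare against
the search CHAR at all until the end-of-string test says the terminator
is nearby ("Only check match with search CHAR if needed" in the diff).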

Geometric Mean of all benchmarks New / Old: 0.832
See below for the full results.

Full xcheck passes on x86_64 with and without multiarch enabled.
---
Results For: strrchr

Geometric Mean of N=30 runs.

Geometric Mean of all benchmarks New / Old: 0.832
Benchmark performance on Tigerlake:
https://ark.intel.com/content/www/us/en/ark/products/208921/intel-core-i71165g7-processor-12m-cache-up-to-4-70-ghz-with-ipu.html

 len, align,  pos, seek, max_char, freq, New Time / Old Time
2048,     0,   32,    0,      127,    1,               0.673
2048,     1,   32,    0,      127,    1,                0.68
2048,     0,   64,    0,      127,    1,               0.566
2048,     2,   64,    0,      127,    1,               0.574
2048,     0,  128,    0,      127,    1,               0.976
2048,     3,  128,    0,      127,    1,               0.967
2048,     0,  256,    0,      127,    1,               0.931
2048,     4,  256,    0,      127,    1,               0.921
2048,     0,  512,    0,      127,    1,               0.792
2048,     5,  512,    0,      127,    1,                0.78
2048,     0, 1024,    0,      127,    1,               0.733
2048,     6, 1024,    0,      127,    1,               0.729
2048,     0, 2048,    0,      127,    1,               0.795
2048,     7, 2048,    0,      127,    1,               0.805
2048,     0, 4096,    0,      127,    1,               0.803
2048,     8, 4096,    0,      127,    1,               0.794
 256,     1,   64,    0,      127,    1,               0.584
 256,    15,   64,    0,      127,    1,               0.587
 256,     2,   64,    0,      127,    1,               0.586
 256,    30,   64,    0,      127,    1,               0.592
 256,     3,   64,    0,      127,    1,               0.586
 256,    45,   64,    0,      127,    1,               0.505
 256,     4,   64,    0,      127,    1,                0.59
 256,    60,   64,    0,      127,    1,               0.501
 256,     5,   64,    0,      127,    1,               0.595
 256,    75,   64,    0,      127,    1,               0.588
 256,     6,   64,    0,      127,    1,               0.593
 256,    90,   64,    0,      127,    1,               0.594
 256,     7,   64,    0,      127,    1,               0.596
 256,   105,   64,    0,      127,    1,               0.506
   1,     0,    0,    0,      127,    1,               0.872
   2,     0,    1,    0,      127,    1,               0.861
   3,     0,    2,    0,      127,    1,               0.862
   4,     0,    3,    0,      127,    1,               0.884
   5,     0,    4,    0,      127,    1,               0.869
   6,     0,    5,    0,      127,    1,               0.861
   7,     0,    6,    0,      127,    1,               0.865
   8,     0,    7,    0,      127,    1,               0.884
   9,     0,    8,    0,      127,    1,               0.862
  10,     0,    9,    0,      127,    1,               0.889
  11,     0,   10,    0,      127,    1,                 0.9
  12,     0,   11,    0,      127,    1,               0.897
  13,     0,   12,    0,      127,    1,               0.909
  14,     0,   13,    0,      127,    1,               0.885
  15,     0,   14,    0,      127,    1,               0.929
  16,     0,   15,    0,      127,    1,               0.871
  17,     0,   16,    0,      127,    1,               0.875
  18,     0,   17,    0,      127,    1,               0.878
  19,     0,   18,    0,      127,    1,               0.889
  20,     0,   19,    0,      127,    1,                0.89
  21,     0,   20,    0,      127,    1,               0.901
  22,     0,   21,    0,      127,    1,                0.91
  23,     0,   22,    0,      127,    1,               0.912
  24,     0,   23,    0,      127,    1,               0.907
  25,     0,   24,    0,      127,    1,               0.947
  26,     0,   25,    0,      127,    1,               0.904
  27,     0,   26,    0,      127,    1,               0.921
  28,     0,   27,    0,      127,    1,               0.899
  29,     0,   28,    0,      127,    1,               0.923
  30,     0,   29,    0,      127,    1,               0.918
  31,     0,   30,    0,      127,    1,               0.943
  32,     0,   31,    0,      127,    1,               0.914
2048,     0,   32,   23,      127,    1,               0.815
2048,     1,   32,   23,      127,    1,               0.829
2048,     0,   64,   23,      127,    1,               0.884
2048,     2,   64,   23,      127,    1,               0.882
2048,     0,  128,   23,      127,    1,               0.884
2048,     3,  128,   23,      127,    1,               0.851
2048,     0,  256,   23,      127,    1,               0.843
2048,     4,  256,   23,      127,    1,               0.867
2048,     0,  512,   23,      127,    1,               0.746
2048,     5,  512,   23,      127,    1,               0.863
2048,     0, 1024,   23,      127,    1,               0.662
2048,     6, 1024,   23,      127,    1,               0.683
2048,     0, 2048,   23,      127,    1,               0.852
2048,     7, 2048,   23,      127,    1,               0.837
2048,     0, 4096,   23,      127,    1,               0.837
2048,     8, 4096,   23,      127,    1,               0.829
 256,     1,   64,   23,      127,    1,               0.934
 256,    15,   64,   23,      127,    1,               0.936
 256,     2,   64,   23,      127,    1,               0.931
 256,    30,   64,   23,      127,    1,               0.938
 256,     3,   64,   23,      127,    1,               0.927
 256,    45,   64,   23,      127,    1,               0.863
 256,     4,   64,   23,      127,    1,               0.939
 256,    60,   64,   23,      127,    1,               0.871
 256,     5,   64,   23,      127,    1,                0.94
 256,    75,   64,   23,      127,    1,               0.933
 256,     6,   64,   23,      127,    1,               0.915
 256,    90,   64,   23,      127,    1,               0.934
 256,     7,   64,   23,      127,    1,               0.938
 256,   105,   64,   23,      127,    1,               0.871
   1,     0,    0,   23,      127,    1,               0.865
   2,     0,    1,   23,      127,    1,                0.87
   3,     0,    2,   23,      127,    1,               0.882
   4,     0,    3,   23,      127,    1,               0.901
   5,     0,    4,   23,      127,    1,               0.879
   6,     0,    5,   23,      127,    1,               0.934
   7,     0,    6,   23,      127,    1,               0.874
   8,     0,    7,   23,      127,    1,               0.895
   9,     0,    8,   23,      127,    1,               0.873
  10,     0,    9,   23,      127,    1,               0.861
  11,     0,   10,   23,      127,    1,               0.865
  12,     0,   11,   23,      127,    1,               0.875
  13,     0,   12,   23,      127,    1,               0.878
  14,     0,   13,   23,      127,    1,                0.86
  15,     0,   14,   23,      127,    1,               0.889
  16,     0,   15,   23,      127,    1,               0.875
  17,     0,   16,   23,      127,    1,               0.911
  18,     0,   17,   23,      127,    1,               0.891
  19,     0,   18,   23,      127,    1,               0.921
  20,     0,   19,   23,      127,    1,               0.898
  21,     0,   20,   23,      127,    1,               0.895
  22,     0,   21,   23,      127,    1,               0.906
  23,     0,   22,   23,      127,    1,               0.911
  24,     0,   23,   23,      127,    1,               0.877
  25,     0,   24,   23,      127,    1,                 0.9
  26,     0,   25,   23,      127,    1,               0.911
  27,     0,   26,   23,      127,    1,               0.926
  28,     0,   27,   23,      127,    1,               0.918
  29,     0,   28,   23,      127,    1,               0.952
  30,     0,   29,   23,      127,    1,               0.943
  31,     0,   30,   23,      127,    1,               0.934
  32,     0,   31,   23,      127,    1,                 0.8
2048,     0,   32,   23,      127,    2,               0.872
2048,     1,   32,   23,      127,    2,               0.819
2048,     0,   64,   23,      127,    2,               0.815
2048,     2,   64,   23,      127,    2,               0.805
2048,     0,  128,   23,      127,    2,               0.884
2048,     3,  128,   23,      127,    2,               0.852
2048,     0,  256,   23,      127,    2,               0.873
2048,     4,  256,   23,      127,    2,               0.871
2048,     0,  512,   23,      127,    2,               0.654
2048,     5,  512,   23,      127,    2,               0.762
2048,     0, 1024,   23,      127,    2,               0.646
2048,     6, 1024,   23,      127,    2,               0.665
2048,     0, 2048,   23,      127,    2,               0.678
2048,     7, 2048,   23,      127,    2,               0.675
2048,     0, 4096,   23,      127,    2,               0.849
2048,     8, 4096,   23,      127,    2,               0.835
 256,     1,   64,   23,      127,    2,               0.917
 256,    15,   64,   23,      127,    2,               0.915
 256,     2,   64,   23,      127,    2,               0.911
 256,    30,   64,   23,      127,    2,               0.907
 256,     3,   64,   23,      127,    2,                 0.9
 256,    45,   64,   23,      127,    2,               0.816
 256,     4,   64,   23,      127,    2,               0.912
 256,    60,   64,   23,      127,    2,                0.81
 256,     5,   64,   23,      127,    2,               0.904
 256,    75,   64,   23,      127,    2,               0.911
 256,     6,   64,   23,      127,    2,               0.898
 256,    90,   64,   23,      127,    2,               0.912
 256,     7,   64,   23,      127,    2,               0.909
 256,   105,   64,   23,      127,    2,                0.81
   1,     0,    0,   23,      127,    2,               0.858
   2,     0,    1,   23,      127,    2,                0.89
   3,     0,    2,   23,      127,    2,               0.877
   4,     0,    3,   23,      127,    2,               0.863
   5,     0,    4,   23,      127,    2,               0.863
   6,     0,    5,   23,      127,    2,               0.889
   7,     0,    6,   23,      127,    2,               0.898
   8,     0,    7,   23,      127,    2,               0.885
   9,     0,    8,   23,      127,    2,               0.863
  10,     0,    9,   23,      127,    2,               0.902
  11,     0,   10,   23,      127,    2,               0.865
  12,     0,   11,   23,      127,    2,               0.864
  13,     0,   12,   23,      127,    2,                0.87
  14,     0,   13,   23,      127,    2,               0.862
  15,     0,   14,   23,      127,    2,               0.861
  16,     0,   15,   23,      127,    2,               0.859
  17,     0,   16,   23,      127,    2,                0.87
  18,     0,   17,   23,      127,    2,               0.892
  19,     0,   18,   23,      127,    2,               0.874
  20,     0,   19,   23,      127,    2,               0.866
  21,     0,   20,   23,      127,    2,               0.877
  22,     0,   21,   23,      127,    2,               0.868
  23,     0,   22,   23,      127,    2,               0.884
  24,     0,   23,   23,      127,    2,               0.881
  25,     0,   24,   23,      127,    2,               0.872
  26,     0,   25,   23,      127,    2,               0.866
  27,     0,   26,   23,      127,    2,               0.881
  28,     0,   27,   23,      127,    2,                0.93
  29,     0,   28,   23,      127,    2,               0.886
  30,     0,   29,   23,      127,    2,               0.869
  31,     0,   30,   23,      127,    2,               0.869
  32,     0,   31,   23,      127,    2,               0.667
2048,     0,   32,   23,      127,    4,               0.858
2048,     1,   32,   23,      127,    4,               0.858
2048,     0,   64,   23,      127,    4,               0.838
2048,     2,   64,   23,      127,    4,               0.834
2048,     0,  128,   23,      127,    4,                0.85
2048,     3,  128,   23,      127,    4,               0.762
2048,     0,  256,   23,      127,    4,               0.874
2048,     4,  256,   23,      127,    4,               0.796
2048,     0,  512,   23,      127,    4,               0.691
2048,     5,  512,   23,      127,    4,               0.755
2048,     0, 1024,   23,      127,    4,               0.676
2048,     6, 1024,   23,      127,    4,               0.661
2048,     0, 2048,   23,      127,    4,               0.678
2048,     7, 2048,   23,      127,    4,               0.678
2048,     0, 4096,   23,      127,    4,               0.676
2048,     8, 4096,   23,      127,    4,               0.677
 256,     1,   64,   23,      127,    4,               0.875
 256,    15,   64,   23,      127,    4,               0.877
 256,     2,   64,   23,      127,    4,               0.875
 256,    30,   64,   23,      127,    4,               0.875
 256,     3,   64,   23,      127,    4,               0.878
 256,    45,   64,   23,      127,    4,               0.829
 256,     4,   64,   23,      127,    4,               0.876
 256,    60,   64,   23,      127,    4,               0.807
 256,     5,   64,   23,      127,    4,               0.874
 256,    75,   64,   23,      127,    4,               0.872
 256,     6,   64,   23,      127,    4,               0.874
 256,    90,   64,   23,      127,    4,               0.874
 256,     7,   64,   23,      127,    4,               0.873
 256,   105,   64,   23,      127,    4,               0.826
   1,     0,    0,   23,      127,    4,               0.863
   2,     0,    1,   23,      127,    4,               0.861
   3,     0,    2,   23,      127,    4,               0.863
   4,     0,    3,   23,      127,    4,               0.867
   5,     0,    4,   23,      127,    4,               0.866
   6,     0,    5,   23,      127,    4,               0.873
   7,     0,    6,   23,      127,    4,               0.873
   8,     0,    7,   23,      127,    4,               0.866
   9,     0,    8,   23,      127,    4,               0.861
  10,     0,    9,   23,      127,    4,               0.861
  11,     0,   10,   23,      127,    4,               0.857
  12,     0,   11,   23,      127,    4,               0.864
  13,     0,   12,   23,      127,    4,                0.86
  14,     0,   13,   23,      127,    4,               0.859
  15,     0,   14,   23,      127,    4,               0.854
  16,     0,   15,   23,      127,    4,               0.857
  17,     0,   16,   23,      127,    4,               0.881
  18,     0,   17,   23,      127,    4,               0.863
  19,     0,   18,   23,      127,    4,                0.86
  20,     0,   19,   23,      127,    4,               0.906
  21,     0,   20,   23,      127,    4,               0.924
  22,     0,   21,   23,      127,    4,               0.885
  23,     0,   22,   23,      127,    4,               0.861
  24,     0,   23,   23,      127,    4,               0.907
  25,     0,   24,   23,      127,    4,               0.909
  26,     0,   25,   23,      127,    4,               0.863
  27,     0,   26,   23,      127,    4,               0.862
  28,     0,   27,   23,      127,    4,               0.887
  29,     0,   28,   23,      127,    4,               0.879
  30,     0,   29,   23,      127,    4,               0.932
  31,     0,   30,   23,      127,    4,               0.895
  32,     0,   31,   23,      127,    4,               0.666
2048,     0,   32,   23,      127,    8,               0.865
2048,     1,   32,   23,      127,    8,               0.892
2048,     0,   64,   23,      127,    8,                0.85
2048,     2,   64,   23,      127,    8,               0.834
2048,     0,  128,   23,      127,    8,               0.823
2048,     3,  128,   23,      127,    8,               0.809
2048,     0,  256,   23,      127,    8,                0.84
2048,     4,  256,   23,      127,    8,               0.738
2048,     0,  512,   23,      127,    8,               0.656
2048,     5,  512,   23,      127,    8,               0.644
2048,     0, 1024,   23,      127,    8,               0.705
2048,     6, 1024,   23,      127,    8,               0.708
2048,     0, 2048,   23,      127,    8,               0.701
2048,     7, 2048,   23,      127,    8,                 0.7
2048,     0, 4096,   23,      127,    8,                0.68
2048,     8, 4096,   23,      127,    8,               0.678
 256,     1,   64,   23,      127,    8,               0.881
 256,    15,   64,   23,      127,    8,               0.879
 256,     2,   64,   23,      127,    8,               0.878
 256,    30,   64,   23,      127,    8,               0.877
 256,     3,   64,   23,      127,    8,                0.88
 256,    45,   64,   23,      127,    8,               0.829
 256,     4,   64,   23,      127,    8,               0.883
 256,    60,   64,   23,      127,    8,               0.808
 256,     5,   64,   23,      127,    8,               0.875
 256,    75,   64,   23,      127,    8,               0.877
 256,     6,   64,   23,      127,    8,               0.874
 256,    90,   64,   23,      127,    8,               0.874
 256,     7,   64,   23,      127,    8,               0.874
 256,   105,   64,   23,      127,    8,                0.83
   1,     0,    0,   23,      127,    8,               0.862
   2,     0,    1,   23,      127,    8,               0.865
   3,     0,    2,   23,      127,    8,               0.866
   4,     0,    3,   23,      127,    8,               0.863
   5,     0,    4,   23,      127,    8,               0.874
   6,     0,    5,   23,      127,    8,                0.87
   7,     0,    6,   23,      127,    8,                0.87
   8,     0,    7,   23,      127,    8,               0.864
   9,     0,    8,   23,      127,    8,                0.87
  10,     0,    9,   23,      127,    8,               0.861
  11,     0,   10,   23,      127,    8,               0.862
  12,     0,   11,   23,      127,    8,                0.87
  13,     0,   12,   23,      127,    8,               0.858
  14,     0,   13,   23,      127,    8,                0.86
  15,     0,   14,   23,      127,    8,               0.863
  16,     0,   15,   23,      127,    8,               0.866
  17,     0,   16,   23,      127,    8,                0.86
  18,     0,   17,   23,      127,    8,               0.887
  19,     0,   18,   23,      127,    8,               0.858
  20,     0,   19,   23,      127,    8,               0.891
  21,     0,   20,   23,      127,    8,               0.874
  22,     0,   21,   23,      127,    8,               0.891
  23,     0,   22,   23,      127,    8,               0.873
  24,     0,   23,   23,      127,    8,               0.895
  25,     0,   24,   23,      127,    8,               0.884
  26,     0,   25,   23,      127,    8,               0.878
  27,     0,   26,   23,      127,    8,               0.878
  28,     0,   27,   23,      127,    8,               0.891
  29,     0,   28,   23,      127,    8,                0.91
  30,     0,   29,   23,      127,    8,               0.881
  31,     0,   30,   23,      127,    8,               0.917
  32,     0,   31,   23,      127,    8,               0.667
2048,     0,   32,   23,      127,   16,                0.86
2048,     1,   32,   23,      127,   16,               0.847
2048,     0,   64,   23,      127,   16,               0.846
2048,     2,   64,   23,      127,   16,               0.852
2048,     0,  128,   23,      127,   16,                0.82
2048,     3,  128,   23,      127,   16,               0.751
2048,     0,  256,   23,      127,   16,               0.788
2048,     4,  256,   23,      127,   16,               0.712
2048,     0,  512,   23,      127,   16,               0.524
2048,     5,  512,   23,      127,   16,               0.517
2048,     0, 1024,   23,      127,   16,               0.583
2048,     6, 1024,   23,      127,   16,               0.682
2048,     0, 2048,   23,      127,   16,                0.77
2048,     7, 2048,   23,      127,   16,               0.659
2048,     0, 4096,   23,      127,   16,                 0.7
2048,     8, 4096,   23,      127,   16,                 0.7
 256,     1,   64,   23,      127,   16,               0.798
 256,    15,   64,   23,      127,   16,               0.873
 256,     2,   64,   23,      127,   16,               0.875
 256,    30,   64,   23,      127,   16,               0.877
 256,     3,   64,   23,      127,   16,               0.875
 256,    45,   64,   23,      127,   16,               0.834
 256,     4,   64,   23,      127,   16,               0.873
 256,    60,   64,   23,      127,   16,               0.809
 256,     5,   64,   23,      127,   16,               0.879
 256,    75,   64,   23,      127,   16,               0.884
 256,     6,   64,   23,      127,   16,               0.874
 256,    90,   64,   23,      127,   16,               0.876
 256,     7,   64,   23,      127,   16,               0.876
 256,   105,   64,   23,      127,   16,               0.827
   1,     0,    0,   23,      127,   16,               0.859
   2,     0,    1,   23,      127,   16,               0.864
   3,     0,    2,   23,      127,   16,               0.871
   4,     0,    3,   23,      127,   16,               0.869
   5,     0,    4,   23,      127,   16,               0.881
   6,     0,    5,   23,      127,   16,               0.869
   7,     0,    6,   23,      127,   16,               0.867
   8,     0,    7,   23,      127,   16,               0.877
   9,     0,    8,   23,      127,   16,               0.862
  10,     0,    9,   23,      127,   16,               0.861
  11,     0,   10,   23,      127,   16,               0.859
  12,     0,   11,   23,      127,   16,               0.858
  13,     0,   12,   23,      127,   16,               0.867
  14,     0,   13,   23,      127,   16,               0.857
  15,     0,   14,   23,      127,   16,               0.858
  16,     0,   15,   23,      127,   16,               0.857
  17,     0,   16,   23,      127,   16,               0.858
  18,     0,   17,   23,      127,   16,               0.867
  19,     0,   18,   23,      127,   16,               0.875
  20,     0,   19,   23,      127,   16,               0.868
  21,     0,   20,   23,      127,   16,               0.861
  22,     0,   21,   23,      127,   16,               0.868
  23,     0,   22,   23,      127,   16,               0.866
  24,     0,   23,   23,      127,   16,               0.858
  25,     0,   24,   23,      127,   16,               0.859
  26,     0,   25,   23,      127,   16,               0.857
  27,     0,   26,   23,      127,   16,               0.866
  28,     0,   27,   23,      127,   16,               0.875
  29,     0,   28,   23,      127,   16,               0.896
  30,     0,   29,   23,      127,   16,               0.889
  31,     0,   30,   23,      127,   16,               0.903
  32,     0,   31,   23,      127,   16,               0.667
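
In the table above, every "New Time / Old Time" ratio below 1.0 means
the new implementation was faster for that configuration. The headline
0.832 is the geometric mean of those ratios, i.e. exp of the mean of
the logs. A minimal sketch of that computation (the ratios[] values are
just the first few rows above, for illustration):

/* Geometric mean of New/Old ratios: exp (mean (log (r_i))).
   Compile with -lm.  */
#include <math.h>
#include <stdio.h>

int
main (void)
{
  double ratios[] = { 0.673, 0.68, 0.566, 0.574, 0.976 };
  size_t n = sizeof ratios / sizeof ratios[0];
  double log_sum = 0.0;
  for (size_t i = 0; i < n; i++)
    log_sum += log (ratios[i]);
  printf ("geometric mean: %.3f\n", exp (log_sum / n));
  return 0;
}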

 sysdeps/x86_64/multiarch/strrchr-avx2.S | 415 +++++++++++++++---------
 1 file changed, 258 insertions(+), 157 deletions(-)

Patch

diff --git a/sysdeps/x86_64/multiarch/strrchr-avx2.S b/sysdeps/x86_64/multiarch/strrchr-avx2.S
index 1df2adfad0..9d1e45defc 100644
--- a/sysdeps/x86_64/multiarch/strrchr-avx2.S
+++ b/sysdeps/x86_64/multiarch/strrchr-avx2.S
@@ -27,9 +27,13 @@ 
 # ifdef USE_AS_WCSRCHR
 #  define VPBROADCAST	vpbroadcastd
 #  define VPCMPEQ	vpcmpeqd
+#  define VPMIN	vpminud
+#  define CHAR_SIZE	4
 # else
 #  define VPBROADCAST	vpbroadcastb
 #  define VPCMPEQ	vpcmpeqb
+#  define VPMIN	vpminub
+#  define CHAR_SIZE	1
 # endif
 
 # ifndef VZEROUPPER
@@ -41,196 +45,293 @@ 
 # endif
 
 # define VEC_SIZE	32
+# define PAGE_SIZE	4096
 
-	.section SECTION(.text),"ax",@progbits
-ENTRY (STRRCHR)
-	movd	%esi, %xmm4
-	movl	%edi, %ecx
+	.section SECTION(.text), "ax", @progbits
+ENTRY(STRRCHR)
+	movd	%esi, %xmm7
+	movl	%edi, %eax
 	/* Broadcast CHAR to YMM4.  */
-	VPBROADCAST %xmm4, %ymm4
+	VPBROADCAST %xmm7, %ymm7
 	vpxor	%xmm0, %xmm0, %xmm0
 
-	/* Check if we may cross page boundary with one vector load.  */
-	andl	$(2 * VEC_SIZE - 1), %ecx
-	cmpl	$VEC_SIZE, %ecx
-	ja	L(cros_page_boundary)
+	/* Shift here instead of `andl` to save code size (saves a fetch
+	   block).  */
+	sall	$20, %eax
+	cmpl	$((PAGE_SIZE - VEC_SIZE) << 20), %eax
+	ja	L(cross_page)
 
+L(page_cross_continue):
 	vmovdqu	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	addq	$VEC_SIZE, %rdi
+	/* Check end of string match.  */
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jz	L(aligned_more)
+
+	/* Only check match with search CHAR if needed.  */
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	/* Check if match before first zero.  */
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret0)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+	/* We are off by 3 for wcsrchr if search CHAR is non-zero. If
+	   search CHAR is zero we are correct. Either way `andq
+	   -CHAR_SIZE, %rax` gets the correct result.  */
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret0):
+L(return_vzeroupper):
+	ZERO_UPPER_VEC_REGISTERS_RETURN
+
+	/* Returns for first vec x1/x2 have a hard-coded backward search
+	   path for earlier matches.  */
+	.p2align 4,, 10
+L(first_vec_x1):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jnz	L(first_vec_x1_return)
+
+	.p2align 4,, 4
+L(first_vec_x0_test):
+	VPCMPEQ	%ymm1, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	testl	%eax, %eax
+	jz	L(ret1)
+	bsrl	%eax, %eax
+	addq	%r8, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret1):
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 10
+L(first_vec_x0_x1_test):
+	VPCMPEQ	%ymm2, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
 	testl	%eax, %eax
-	jnz	L(first_vec)
+	jz	L(first_vec_x0_test)
+	.p2align 4,, 4
+L(first_vec_x1_return):
+	bsrl	%eax, %eax
+	leaq	1(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
-	testl	%ecx, %ecx
-	jnz	L(return_null)
 
-	andq	$-VEC_SIZE, %rdi
-	xorl	%edx, %edx
-	jmp	L(aligned_loop)
+	.p2align 4,, 10
+L(first_vec_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm6
+	vpmovmskb %ymm6, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(first_vec_x0_x1_test)
+	bsrl	%eax, %eax
+	leaq	(VEC_SIZE + 1)(%rdi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
+
 
 	.p2align 4
-L(first_vec):
-	/* Check if there is a nul CHAR.  */
+L(aligned_more):
+	/* Save original pointer if match was in VEC 0.  */
+	movq	%rdi, %r8
+
+	/* Align src.  */
+	orq	$(VEC_SIZE - 1), %rdi
+	vmovdqu	1(%rdi), %ymm2
+	VPCMPEQ	%ymm2, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
 	testl	%ecx, %ecx
-	jnz	L(char_and_nul_in_first_vec)
+	jnz	L(first_vec_x1)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	movq	%rdi, %rsi
-	andq	$-VEC_SIZE, %rdi
-	jmp	L(aligned_loop)
+	vmovdqu	(VEC_SIZE + 1)(%rdi), %ymm3
+	VPCMPEQ	%ymm3, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	testl	%ecx, %ecx
+	jnz	L(first_vec_x2)
 
+	/* Save pointer again before realigning.  */
+	movq	%rdi, %rsi
+	addq	$(VEC_SIZE + 1), %rdi
+	andq	$-(VEC_SIZE * 2), %rdi
 	.p2align 4
-L(cros_page_boundary):
-	andl	$(VEC_SIZE - 1), %ecx
-	andq	$-VEC_SIZE, %rdi
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %edx
-	vpmovmskb %ymm3, %eax
-	shrl	%cl, %edx
-	shrl	%cl, %eax
-	addq	$VEC_SIZE, %rdi
-
-	/* Check if there is a CHAR.  */
+L(first_aligned_loop):
+	/* Do 2x VEC at a time. Any more and the cost of finding the
+	   match outweighs the loop benefit.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm8
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm8, %ymm0, %ymm8
+	vpor	%ymm5, %ymm8, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
+	/* No zero or search CHAR.  */
 	testl	%eax, %eax
-	jnz	L(found_char)
-
-	testl	%edx, %edx
-	jnz	L(return_null)
+	jz	L(first_aligned_loop)
 
-	jmp	L(aligned_loop)
-
-	.p2align 4
-L(found_char):
-	testl	%edx, %edx
-	jnz	L(char_and_nul)
+	/* If no zero CHAR then go to second loop (this allows us to
+	   throw away all prior work).  */
+	vpmovmskb %ymm8, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_prep)
 
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
-	leaq	(%rdi, %rcx), %rsi
+	/* Search char could be zero so we need to get the true match.
+	 */
+	vpmovmskb %ymm5, %eax
+	testl	%eax, %eax
+	jnz	L(first_aligned_loop_return)
 
-	.p2align 4
-L(aligned_loop):
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	add	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
+	.p2align 4,, 4
+L(first_vec_x1_or_x2):
+	VPCMPEQ	%ymm3, %ymm7, %ymm3
+	VPCMPEQ	%ymm2, %ymm7, %ymm2
 	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jnz	L(char_nor_null)
-
-	vmovdqa	(%rdi), %ymm1
-	VPCMPEQ	%ymm1, %ymm0, %ymm2
-	addq	$VEC_SIZE, %rdi
-	VPCMPEQ	%ymm1, %ymm4, %ymm3
-	vpmovmskb %ymm2, %ecx
-	vpmovmskb %ymm3, %eax
-	orl	%eax, %ecx
-	jz	L(aligned_loop)
-
-	.p2align 4
-L(char_nor_null):
-	/* Find a CHAR or a nul CHAR in a loop.  */
-	testl	%eax, %eax
-	jnz	L(match)
-L(return_value):
-	testl	%edx, %edx
-	jz	L(return_null)
-	movl	%edx, %eax
-	movq	%rsi, %rdi
+	vpmovmskb %ymm2, %edx
+	/* Use add for macro-fusion.  */
+	addq	%rax, %rdx
+	jz	L(first_vec_x0_test)
+	/* NB: We could move this shift to before the branch and save a
+	   bit of code size / performance on the fall through. The
+	   branch leads to the null case which generally seems hotter
+	   than char in first 3x VEC.  */
+	salq	$32, %rax
+	addq	%rdx, %rax
+	bsrq	%rax, %rax
+	leaq	1(%rsi, %rax), %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+	VZEROUPPER_RETURN
 
+	.p2align 4,, 8
+L(first_aligned_loop_return):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(first_vec_x1_or_x2)
+
+	bsrq	%rax, %rax
+	leaq	-(VEC_SIZE * 2)(%rdi, %rax), %rax
 # ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %eax
+	andq	$-CHAR_SIZE, %rax
 # endif
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-L(return_vzeroupper):
-	ZERO_UPPER_VEC_REGISTERS_RETURN
+	VZEROUPPER_RETURN
 
+	/* Search char cannot be zero.  */
 	.p2align 4
-L(match):
-	/* Find a CHAR.  Check if there is a nul CHAR.  */
-	vpmovmskb %ymm2, %ecx
-	testl	%ecx, %ecx
-	jnz	L(find_nul)
-
-	/* Remember the match and keep searching.  */
-	movl	%eax, %edx
+L(second_aligned_loop_set_furthest_match):
+	/* Save VEC and pointer from most recent match.  */
+L(second_aligned_loop_prep):
 	movq	%rdi, %rsi
-	jmp	L(aligned_loop)
+	vmovdqu	%ymm6, %ymm2
+	vmovdqu	%ymm10, %ymm3
 
 	.p2align 4
-L(find_nul):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+L(second_aligned_loop):
+	/* Search 2x VEC at a time.  */
+	vmovdqa	(VEC_SIZE * 0)(%rdi), %ymm4
+	vmovdqa	(VEC_SIZE * 1)(%rdi), %ymm5
+
+	VPCMPEQ	%ymm4, %ymm7, %ymm6
+	VPMIN	%ymm4, %ymm5, %ymm1
+	VPCMPEQ	%ymm5, %ymm7, %ymm10
+	vpor	%ymm6, %ymm10, %ymm5
+	VPCMPEQ	%ymm1, %ymm0, %ymm1
+	vpor	%ymm5, %ymm1, %ymm9
+
+	vpmovmskb %ymm9, %eax
+	addq	$(VEC_SIZE * 2), %rdi
 	testl	%eax, %eax
-	/* If there is no CHAR here, return the remembered one.  */
-	jz	L(return_value)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
-	VZEROUPPER_RETURN
-
-	.p2align 4
-L(char_and_nul):
-	/* Find both a CHAR and a nul CHAR.  */
-	addq	%rcx, %rdi
-	movl	%edx, %ecx
-L(char_and_nul_in_first_vec):
-# ifdef USE_AS_WCSRCHR
-	/* Keep the first bit for each matching CHAR for bsr.  */
-	andl	$0x11111111, %ecx
-	andl	$0x11111111, %eax
-# endif
-	/* Mask out any matching bits after the nul CHAR.  */
-	movl	%ecx, %r8d
-	subl	$1, %r8d
-	xorl	%ecx, %r8d
-	andl	%r8d, %eax
+	jz	L(second_aligned_loop)
+	vpmovmskb %ymm1, %ecx
+	testl	%ecx, %ecx
+	jz	L(second_aligned_loop_set_furthest_match)
+	vpmovmskb %ymm5, %eax
 	testl	%eax, %eax
-	/* Return null pointer if the nul CHAR comes first.  */
-	jz	L(return_null)
-	bsrl	%eax, %eax
-	leaq	-VEC_SIZE(%rdi, %rax), %rax
+	jnz	L(return_new_match)
+
+	/* This is the hot path. We know CHAR is in bounds and that
+	   ymm3/ymm2 have the latest match.  */
+	.p2align 4,, 4
+L(return_old_match):
+	vpmovmskb %ymm3, %eax
+	vpmovmskb %ymm2, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rsi, %rax), %rax
 	VZEROUPPER_RETURN
 
-	.p2align 4
-L(return_null):
-	xorl	%eax, %eax
+	/* Last iteration also potentially has a match.  */
+	.p2align 4,, 8
+L(return_new_match):
+	VPCMPEQ	%ymm4, %ymm0, %ymm4
+	vpmovmskb %ymm4, %edx
+	salq	$32, %rcx
+	orq	%rdx, %rcx
+
+	vpmovmskb %ymm10, %eax
+	vpmovmskb %ymm6, %edx
+	salq	$32, %rax
+	orq	%rdx, %rax
+	blsmskq	%rcx, %rcx
+	andq	%rcx, %rax
+	jz	L(return_old_match)
+	bsrq	%rax, %rax
+	/* Search char cannot be zero so safe to just use lea for
+	   wcsrchr.  */
+	leaq	(VEC_SIZE * -2 -(CHAR_SIZE - 1))(%rdi, %rax), %rax
 	VZEROUPPER_RETURN
 
-END (STRRCHR)
+	.p2align 4,, 4
+L(cross_page):
+	movq	%rdi, %rsi
+	andq	$-VEC_SIZE, %rsi
+	vmovdqu	(%rsi), %ymm1
+	VPCMPEQ	%ymm1, %ymm0, %ymm6
+	vpmovmskb %ymm6, %ecx
+	shrxl	%edi, %ecx, %ecx
+	testl	%ecx, %ecx
+	jz	L(page_cross_continue)
+	VPCMPEQ	%ymm1, %ymm7, %ymm1
+	vpmovmskb %ymm1, %eax
+	shrxl	%edi, %eax, %eax
+	blsmskl	%ecx, %ecx
+	andl	%ecx, %eax
+	jz	L(ret2)
+	bsrl	%eax, %eax
+	addq	%rdi, %rax
+# ifdef USE_AS_WCSRCHR
+	andq	$-CHAR_SIZE, %rax
+# endif
+L(ret2):
+	VZEROUPPER_RETURN
+END(STRRCHR)
 #endif
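
A side note on the page-cross check near the top of the patch: the
`sall $20` / `cmpl` / `ja` sequence shifts the page-offset bits of the
pointer into the top of %eax so that one unsigned compare answers
"would a VEC_SIZE load cross a page?", replacing the `andl` form the
old code used. A hedged C model of that equivalence (my illustration,
not glibc code; PAGE_SIZE and VEC_SIZE match the patch's macros):

/* Check the shift form of the page-cross test against the masking
   form, exhaustively over a few pages' worth of addresses.  */
#include <assert.h>
#include <stdint.h>

#define PAGE_SIZE 4096	/* 1 << 12 */
#define VEC_SIZE 32

/* Masking form: does a VEC_SIZE load at addr cross a page?  */
static int
crosses_page_and (uint32_t addr)
{
  return (addr & (PAGE_SIZE - 1)) > PAGE_SIZE - VEC_SIZE;
}

/* Shift form from the patch: `sall $20` discards everything above the
   12 page-offset bits (32 - 12 == 20), then one unsigned compare
   (`ja`) does the rest.  */
static int
crosses_page_shift (uint32_t addr)
{
  return (addr << 20) > ((uint32_t) (PAGE_SIZE - VEC_SIZE) << 20);
}

int
main (void)
{
  for (uint32_t a = 0; a < 4 * PAGE_SIZE; a++)
    assert (crosses_page_and (a) == crosses_page_shift (a));
  return 0;
}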
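
Another note, on the vpmovmskb / `salq $32` / `orq` / `bsrq` pattern in
the return paths: the 32-bit match masks of two consecutive vectors are
merged into one 64-bit value so that a single bit scan finds the
furthest match across both. A small stand-alone model (again only an
illustration; mask_lo/mask_hi stand in for vpmovmskb results):

/* Merge two 32-byte match masks and bit-scan once (~bsrq).  */
#include <stdint.h>
#include <stdio.h>

static long
last_match_offset (uint32_t mask_lo, uint32_t mask_hi)
{
  uint64_t m = ((uint64_t) mask_hi << 32) | mask_lo;
  if (m == 0)
    return -1;	/* no match in either vector */
  return 63 - __builtin_clzll (m);	/* byte offset in [0, 63] */
}

int
main (void)
{
  /* Matches at byte 5 of the first vector and byte 2 of the second;
     the furthest is 32 + 2 == 34.  */
  printf ("%ld\n", last_match_offset (1u << 5, 1u << 2));
  return 0;
}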