diff mbox series

[v4] aarch64: Add TX3 machine model

Message ID 20200424172344.GA11069@bell-sw.com
State New
Headers show
Series [v4] aarch64: Add TX3 machine model | expand

Commit Message

Anton Youdkevitch April 24, 2020, 5:23 p.m. UTC
Here is the patch introducing the thunderx3t11 machine model
for the scheduler. A name for the new chip was added to the
list of names recognized as valid parameters for the -mcpu
and -mtune flags. The TX2 cost model was reused for TX3.

The previously used "cryptic" name for the command line
parameter is replaced with the same "thunderx3t11" name.

Added the new chip name to the documentation. Fixed
copyright names and dates.

Lowering the chip capabilities to v8.3 to be on the
safe side.

Bootstrapped on AArch64.

2020-04-23 Anton Youdkevitch <anton.youdkevitch@bell-sw.com>

        * config/aarch64/aarch64-cores.def: Add the chip name.
        * config/aarch64/aarch64-tune.md: Regenerated.
        * config/aarch64/aarch64.c: Add the cost tables for the chip.
        * config/aarch64/thunderx3t11.md: New file: add the new
        machine model for the scheduler
        * config/aarch64/aarch64.md: Include the new model.
	* doc/invoke.texi: Add the new name to the list

---
gcc/config/aarch64/aarch64-cores.def |   3 +
gcc/config/aarch64/aarch64-tune.md   |   2 +-
gcc/config/aarch64/aarch64.c         |  27 ++
gcc/config/aarch64/aarch64.md        |   1 +
gcc/config/aarch64/thunderx3t11.md   | 686 +++++++++++++++++++++++++++++++++
gcc/doc/invoke.texi                  |   2 +-
6 files changed, 719 insertions(+), 2 deletions(-)
diff mbox series

Patch

From 4c2c69ab7327bda62d2380139cd2825bcc647988 Mon Sep 17 00:00:00 2001
From: Anton Youdkevitch <anton.youdkevitch@bell-sw.com>
Date: Mon, 23 Mar 2020 13:22:35 -0700
Subject: [PATCH] TX3 scheduling and tuning implementation

Added the scheduler descriptions for TX3 and the
cost tables, which are borrowed from TX2.
---
 gcc/config/aarch64/aarch64-cores.def |   3 +
 gcc/config/aarch64/aarch64-tune.md   |   2 +-
 gcc/config/aarch64/aarch64.c         |  27 ++
 gcc/config/aarch64/aarch64.md        |   1 +
 gcc/config/aarch64/thunderx3t11.md   | 686 +++++++++++++++++++++++++++++++++++
 gcc/doc/invoke.texi                  |   2 +-
 6 files changed, 719 insertions(+), 2 deletions(-)
 create mode 100644 gcc/config/aarch64/thunderx3t11.md

diff --git a/gcc/config/aarch64/aarch64-cores.def b/gcc/config/aarch64/aarch64-cores.def
index ea9b98b..4269c6c 100644
--- a/gcc/config/aarch64/aarch64-cores.def
+++ b/gcc/config/aarch64/aarch64-cores.def
@@ -95,6 +95,9 @@  AARCH64_CORE("vulcan",  vulcan, thunderx2t99, 8_1A,  AARCH64_FL_FOR_ARCH8_1 | AA
 /* Cavium ('C') cores. */
 AARCH64_CORE("thunderx2t99",  thunderx2t99,  thunderx2t99, 8_1A,  AARCH64_FL_FOR_ARCH8_1 | AARCH64_FL_CRYPTO, thunderx2t99, 0x43, 0x0af, -1)
 
+/* Marvell cores (TX3). */
+AARCH64_CORE("thunderx3t11",  thunderx3t11,  thunderx3t11, 8_3A,  AARCH64_FL_FOR_ARCH8_3 | AARCH64_FL_CRYPTO | AARCH64_FL_RCPC | AARCH64_FL_SM4 | AARCH64_FL_SHA3 | AARCH64_FL_F16FML | AARCH64_FL_RCPC8_4, thunderx3t11, 0x43, 0x0b8, 0x0a)
+
 /* ARMv8.2-A Architecture Processors.  */
 
 /* ARM ('A') cores. */
diff --git a/gcc/config/aarch64/aarch64-tune.md b/gcc/config/aarch64/aarch64-tune.md
index 3cc1c4d..573a4a9 100644
--- a/gcc/config/aarch64/aarch64-tune.md
+++ b/gcc/config/aarch64/aarch64-tune.md
@@ -1,5 +1,5 @@ 
 ;; -*- buffer-read-only: t -*-
 ;; Generated automatically by gentune.sh from aarch64-cores.def
 (define_attr "tune"
-	"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,tsv110,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55"
+	"cortexa34,cortexa35,cortexa53,cortexa57,cortexa72,cortexa73,thunderx,thunderxt88p1,thunderxt88,octeontx,octeontxt81,octeontxt83,thunderxt81,thunderxt83,emag,xgene1,falkor,qdf24xx,exynosm1,phecda,thunderx2t99p1,vulcan,thunderx2t99,thunderx3t11,cortexa55,cortexa75,cortexa76,cortexa76ae,cortexa77,cortexa65,cortexa65ae,ares,neoversen1,neoversee1,octeontx2,octeontx2t98,octeontx2t96,octeontx2t93,octeontx2f95,octeontx2f95n,octeontx2f95mm,tsv110,saphira,cortexa57cortexa53,cortexa72cortexa53,cortexa73cortexa35,cortexa73cortexa53,cortexa75cortexa55,cortexa76cortexa55"
 	(const (symbol_ref "((enum attr_tune) aarch64_tune)")))
diff --git a/gcc/config/aarch64/aarch64.c b/gcc/config/aarch64/aarch64.c
index 24c055d..7abce6a 100644
--- a/gcc/config/aarch64/aarch64.c
+++ b/gcc/config/aarch64/aarch64.c
@@ -1216,6 +1216,33 @@  static const struct tune_params thunderx2t99_tunings =
   &thunderx2t99_prefetch_tune
 };
 
+static const struct tune_params thunderx3t11_tunings =
+{
+  &thunderx2t99_extra_costs,
+  &thunderx2t99_addrcost_table,
+  &thunderx2t99_regmove_cost,
+  &thunderx2t99_vector_cost,
+  &generic_branch_cost,
+  &generic_approx_modes,
+  SVE_NOT_IMPLEMENTED, /* sve_width  */
+  4, /* memmov_cost.  */
+  4, /* issue_rate.  */
+  (AARCH64_FUSE_ALU_BRANCH | AARCH64_FUSE_AES_AESMC
+   | AARCH64_FUSE_ALU_CBZ), /* fusible_ops  */
+  "16",	/* function_align.  */
+  "8",	/* jump_align.  */
+  "16",	/* loop_align.  */
+  3,	/* int_reassoc_width.  */
+  2,	/* fp_reassoc_width.  */
+  2,	/* vec_reassoc_width.  */
+  2,	/* min_div_recip_mul_sf.  */
+  2,	/* min_div_recip_mul_df.  */
+  0,	/* max_case_values.  */
+  tune_params::AUTOPREFETCHER_WEAK,	/* autoprefetcher_model.  */
+  (AARCH64_EXTRA_TUNE_NONE),	/* tune_flags.  */
+  &thunderx2t99_prefetch_tune
+};
+
 static const struct tune_params neoversen1_tunings =
 {
   &cortexa57_extra_costs,
diff --git a/gcc/config/aarch64/aarch64.md b/gcc/config/aarch64/aarch64.md
index c7c4d1d..d2123f8 100644
--- a/gcc/config/aarch64/aarch64.md
+++ b/gcc/config/aarch64/aarch64.md
@@ -438,6 +438,7 @@ 
 (include "../arm/xgene1.md")
 (include "thunderx2t99.md")
 (include "tsv110.md")
+(include "thunderx3t11.md")
 
 ;; -------------------------------------------------------------------
 ;; Jumps and other miscellaneous insns
diff --git a/gcc/config/aarch64/thunderx3t11.md b/gcc/config/aarch64/thunderx3t11.md
new file mode 100644
index 0000000..91cf1e0
--- /dev/null
+++ b/gcc/config/aarch64/thunderx3t11.md
@@ -0,0 +1,686 @@ 
+;; Cavium ThunderX 3 CN11xx pipeline description
+;; Copyright (C) 2020 Free Software Foundation, Inc.
+;;
+;; Contributed by Marvell
+
+;; This file is part of GCC.
+
+;; GCC is free software; you can redistribute it and/or modify
+;; it under the terms of the GNU General Public License as published by
+;; the Free Software Foundation; either version 3, or (at your option)
+;; any later version.
+
+;; GCC is distributed in the hope that it will be useful,
+;; but WITHOUT ANY WARRANTY; without even the implied warranty of
+;; MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+;; GNU General Public License for more details.
+
+;; You should have received a copy of the GNU General Public License
+;; along with GCC; see the file COPYING3.  If not see
+;; <http://www.gnu.org/licenses/>.
+
+(define_automaton "thunderx3t11, thunderx3t11_advsimd, thunderx3t11_ldst")
+(define_automaton "thunderx3t11_mult")
+
+(define_cpu_unit "thunderx3t11_i0" "thunderx3t11")
+(define_cpu_unit "thunderx3t11_i1" "thunderx3t11")
+(define_cpu_unit "thunderx3t11_i2" "thunderx3t11")
+(define_cpu_unit "thunderx3t11_i3" "thunderx3t11")
+
+(define_cpu_unit "thunderx3t11_ls0" "thunderx3t11_ldst")
+(define_cpu_unit "thunderx3t11_ls1" "thunderx3t11_ldst")
+(define_cpu_unit "thunderx3t11_sd" "thunderx3t11_ldst")
+
+; Pseudo-units for multiply pipeline.
+; unchanged from TX2, occupies I1 for four (1 + 3 additional) slots
+
+(define_cpu_unit "thunderx3t11_i1m1" "thunderx3t11_mult")
+(define_cpu_unit "thunderx3t11_i1m2" "thunderx3t11_mult")
+(define_cpu_unit "thunderx3t11_i1m3" "thunderx3t11_mult")
+
+; Pseudo-units for load delay (assuming dcache hit).
+
+(define_cpu_unit "thunderx3t11_ls0d1" "thunderx3t11_ldst")
+(define_cpu_unit "thunderx3t11_ls0d2" "thunderx3t11_ldst")
+(define_cpu_unit "thunderx3t11_ls0d3" "thunderx3t11_ldst")
+
+(define_cpu_unit "thunderx3t11_ls1d1" "thunderx3t11_ldst")
+(define_cpu_unit "thunderx3t11_ls1d2" "thunderx3t11_ldst")
+(define_cpu_unit "thunderx3t11_ls1d3" "thunderx3t11_ldst")
+
+; Define FP units f0/f1/f2/f3.
+(define_cpu_unit "thunderx3t11_f0" "thunderx3t11_advsimd")
+(define_cpu_unit "thunderx3t11_f1" "thunderx3t11_advsimd")
+(define_cpu_unit "thunderx3t11_f2" "thunderx3t11_advsimd")
+(define_cpu_unit "thunderx3t11_f3" "thunderx3t11_advsimd")
+
+(define_reservation "thunderx3t11_i23" "thunderx3t11_i2|thunderx3t11_i3")
+(define_reservation "thunderx3t11_i01"
+    "thunderx3t11_i0|thunderx3t11_i1")
+(define_reservation "thunderx3t11_i012"
+    "thunderx3t11_i0|thunderx3t11_i1|thunderx3t11_i2")
+(define_reservation "thunderx3t11_i0123"
+    "thunderx3t11_i0|thunderx3t11_i1|thunderx3t11_i2|thunderx3t11_i3")
+(define_reservation "thunderx3t11_ls01" "thunderx3t11_ls0|thunderx3t11_ls1")
+(define_reservation "thunderx3t11_f01" "thunderx3t11_f0|thunderx3t11_f1")
+(define_reservation "thunderx3t11_f23" "thunderx3t11_f2|thunderx3t11_f3")
+(define_reservation "thunderx3t11_f0123"
+    "thunderx3t11_f0|thunderx3t11_f1|thunderx3t11_f2|thunderx3t11_f3")
+
+; A load with delay in the ls0/ls1 pipes.
+; this is always a delay of four
+(define_reservation "thunderx3t11_l0delay"
+    "thunderx3t11_ls0,thunderx3t11_ls0d1,thunderx3t11_ls0d2,\
+     thunderx3t11_ls0d3")
+(define_reservation "thunderx3t11_l1delay"
+    "thunderx3t11_ls1,thunderx3t11_ls1d1,thunderx3t11_ls1d2,\
+     thunderx3t11_ls1d3")
+(define_reservation "thunderx3t11_l01delay"
+    "thunderx3t11_l0delay|thunderx3t11_l1delay")
+;; Branch and call instructions.
+
+(define_insn_reservation "thunderx3t11_branch" 1
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "call,branch,trap"))
+  "thunderx3t11_i23")
+
+;; Misc instructions.
+
+; Speculation barrier
+(define_insn_reservation "thunderx3t11_nothing" 0
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "block"))
+  "nothing")
+
+(define_insn_reservation "thunderx3t11_mrs" 0
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "mrs"))
+  "thunderx3t11_i2")
+
+(define_insn_reservation "thunderx3t11_multiple" 1
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "multiple"))
+  "thunderx3t11_i0+thunderx3t11_i1+thunderx3t11_i3+thunderx3t11_ls0+\
+   thunderx3t11_ls1+thunderx3t11_sd+thunderx3t11_i1m1+thunderx3t11_i1m2+\
+   thunderx3t11_i1m3+thunderx3t11_f0+thunderx3t11_f1")
+
+;; Integer arithmetic/logic instructions.
+
+; Plain register moves are handled by renaming,
+; and don't create any uops.
+(define_insn_reservation "thunderx3t11_regmove" 0
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "mov_reg"))
+  "nothing")
+
+(define_insn_reservation "thunderx3t11_alu_basic" 1
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "alu_imm,alu_sreg,alus_imm,alus_sreg,\
+			adc_reg,adc_imm,adcs_reg,adcs_imm,\
+			logic_reg,logic_imm,logics_reg,logics_imm,\
+			csel,adr,mov_imm,shift_reg,shift_imm,bfm,\
+			bfx,rbit,rev,extend,rotate_imm"))
+  "thunderx3t11_i0123")
+
+; distinguish between latency 1|2 and throughput 1/4|2/4?
+; is it actually 1,1/2,{i0,i1} vs 2,1/4,{i0,i1,i2,i3}
+(define_insn_reservation "thunderx3t11_alu_shift" 2
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "alu_shift_imm,alu_ext,\
+			alus_shift_imm,alus_ext,\
+			logic_shift_imm,logics_shift_imm"))
+  "thunderx3t11_i0123")
+
+(define_insn_reservation "thunderx3t11_alu_shift1" 1
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "alu_shift_imm,alu_ext,\
+			alus_shift_imm,alus_ext,\
+			logic_shift_imm,logics_shift_imm"))
+  "thunderx3t11_i01")
+
+; we are going for the optimistic answer (13)
+; for now, the worst case is 23
+(define_insn_reservation "thunderx3t11_div" 13
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "sdiv,udiv"))
+  "thunderx3t11_i1*3")
+
+(define_insn_reservation "thunderx3t11_madd" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "mla,smlal,umlal"))
+  "thunderx3t11_i0123,thunderx3t11_i1m1,thunderx3t11_i1m2,thunderx3t11_i1m3,\
+   thunderx3t11_i012")
+
+; NOTE: smull, umull are used for "high part" multiplies too.
+; mul is alias for MADD
+; it has to be distinguished between smulh, umulh (4,1) and
+; other (5,1) but there is no such a type, so, we go for the
+; conservative approach of (5,1) for now
+; smulh, umulh only runs on I1
+(define_insn_reservation "thunderx3t11_mul" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "mul,smull,umull"))
+  "thunderx3t11_i0123,thunderx3t11_i1m1,thunderx3t11_i1m2,thunderx3t11_i1m3")
+
+(define_insn_reservation "thunderx3t11_countbits" 3
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "clz"))
+  "thunderx3t11_i1")
+
+;; Integer loads and stores.
+
+; load_4 matches prefetch, a multitude of move/str/dup variants,
+; sign extend
+(define_insn_reservation "thunderx3t11_load_basic" 4
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "load_4"))
+  "thunderx3t11_ls01")
+
+; model use of I0/I1/I2 for index versions only, model 4|8 2nd on load
+(define_insn_reservation "thunderx3t11_loadpair" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "load_8,load_16"))
+  "thunderx3t11_i012,thunderx3t11_ls01")
+
+(define_insn_reservation "thunderx3t11_store_basic" 1
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "store_4"))
+  "thunderx3t11_ls01,thunderx3t11_sd")
+
+; model use of I0/I1/I2/I3 for index versions, model differing
+; throughputs
+(define_insn_reservation "thunderx3t11_storepair_basic" 1
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "store_8,store_16"))
+  "thunderx3t11_ls01,thunderx3t11_sd")
+
+;; FP data processing instructions.
+
+(define_insn_reservation "thunderx3t11_fp_simple" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "ffariths,ffarithd,f_minmaxs,f_minmaxd"))
+  "thunderx3t11_f0123")
+
+; distinguish latency 3/4 throughput 1/2|1/4
+(define_insn_reservation "thunderx3t11_fp_addsub3" 3
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "fadds,faddd"))
+  "thunderx3t11_f23")
+(define_insn_reservation "thunderx3t11_fp_addsub4" 4
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "fadds,faddd"))
+  "thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_fp_cmp" 4
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "fcmps,fcmpd,fccmps,fccmpd"))
+  "thunderx3t11_f0123")
+
+; need to split out latency 23 throughput 23/4: F64 from
+; latency 16 throughput  16/4: FDIV F32
+(define_insn_reservation "thunderx3t11_fp_divsqrt_s" 16
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "fdivs,fsqrts"))
+  "thunderx3t11_f0*3|thunderx3t11_f1*3|\
+   thunderx3t11_f2*3|thunderx3t11_f3*3")
+
+(define_insn_reservation "thunderx3t11_fp_divsqrt_d" 23
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "fdivd,fsqrtd"))
+  "thunderx3t11_f0*5|thunderx3t11_f1*5|\
+   thunderx3t11_f2*5|thunderx3t11_f3*5")
+
+(define_insn_reservation "thunderx3t11_fp_mul_mac" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "fmuls,fmuld,fmacs,fmacd"))
+  "thunderx3t11_f01")
+
+(define_insn_reservation "thunderx3t11_frint" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "f_rints,f_rintd"))
+  "thunderx3t11_f0123")
+
+; mimic latency 3|4 throughput 1/2|1/4
+(define_insn_reservation "thunderx3t11_fcsel3" 3
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "fcsel"))
+  "thunderx3t11_f23")
+
+(define_insn_reservation "thunderx3t11_fcsel4" 4
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "fcsel"))
+  "thunderx3t11_f0123")
+
+;; FP miscellaneous instructions.
+
+(define_insn_reservation "thunderx3t11_fp_cvt" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "f_cvtf2i,f_cvt,f_cvti2f"))
+  "thunderx3t11_f0123")
+
+; even though f_mrc has to belong to fp_mov_to_gen
+; we retain this for the sake of legacy as codegen
+; doesn't use it anyway
+(define_insn_reservation "thunderx3t11_fp_mov3" 3
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "fconsts,fconstd,fmov,f_mrc"))
+  "thunderx3t11_f23")
+
+(define_insn_reservation "thunderx3t11_fp_mov" 4
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "fconsts,fconstd,fmov,f_mrc"))
+  "thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_fp_mov_to_gen" 4
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "f_mcr"))
+  "thunderx3t11_f0123")
+
+;; FP loads and stores.
+;  model use of I0/I1/I2 for post/pre index modes
+
+(define_insn_reservation "thunderx3t11_fp_load_basic" 4
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "f_loads,f_loadd"))
+  "thunderx3t11_ls01")
+
+; model throughput 1
+(define_insn_reservation "thunderx3t11_fp_store_basic" 1
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "f_stores,f_stored"))
+  "thunderx3t11_ls01,thunderx3t11_sd")
+
+;; ASIMD integer instructions.
+
+(define_insn_reservation "thunderx3t11_asimd_int" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_abd,neon_abd_q,\
+			neon_arith_acc,neon_arith_acc_q,\
+			neon_abs,neon_abs_q,\
+			neon_add,neon_add_q,\
+			neon_sub,neon_sub_q,\
+			neon_neg,neon_neg_q,\
+			neon_add_long,neon_add_widen,\
+			neon_add_halve,neon_add_halve_q,\
+			neon_sub_long,neon_sub_widen,\
+			neon_sub_halve,neon_sub_halve_q,\
+			neon_add_halve_narrow_q,neon_sub_halve_narrow_q,\
+			neon_qabs,neon_qabs_q,\
+			neon_qadd,neon_qadd_q,\
+			neon_qneg,neon_qneg_q,\
+			neon_qsub,neon_qsub_q,\
+			neon_minmax,neon_minmax_q,\
+			neon_reduc_minmax,neon_reduc_minmax_q,\
+			neon_mul_b,neon_mul_h,neon_mul_s,\
+			neon_mul_b_q,neon_mul_h_q,neon_mul_s_q,\
+			neon_sat_mul_b,neon_sat_mul_h,neon_sat_mul_s,\
+			neon_sat_mul_b_q,neon_sat_mul_h_q,neon_sat_mul_s_q,\
+			neon_mla_b,neon_mla_h,neon_mla_s,\
+			neon_mla_b_q,neon_mla_h_q,neon_mla_s_q,\
+			neon_mul_b_long,neon_mul_h_long,\
+			neon_mul_s_long,neon_mul_d_long,\
+			neon_sat_mul_b_long,neon_sat_mul_h_long,\
+			neon_sat_mul_s_long,\
+			neon_mla_b_long,neon_mla_h_long,neon_mla_s_long,\
+			neon_sat_mla_b_long,neon_sat_mla_h_long,\
+			neon_sat_mla_s_long,\
+			neon_shift_acc,neon_shift_acc_q,\
+			neon_shift_imm,neon_shift_imm_q,\
+			neon_shift_reg,neon_shift_reg_q,\
+			neon_shift_imm_long,neon_shift_imm_narrow_q,\
+			neon_sat_shift_imm,neon_sat_shift_imm_q,\
+			neon_sat_shift_reg,neon_sat_shift_reg_q,\
+			neon_sat_shift_imm_narrow_q"))
+  "thunderx3t11_f0123")
+
+; neon_reduc_add is used for both addp and [su]adalp
+(define_insn_reservation "thunderx3t11_asimd_reduc_add" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_reduc_add,neon_reduc_add_q"))
+  "thunderx3t11_f01")
+
+(define_insn_reservation "thunderx3t11_asimd_cmp" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_compare,neon_compare_q,neon_compare_zero,\
+			neon_compare_zero_q,neon_tst,neon_tst_q"))
+  "thunderx3t11_f0123")
+
+; neon_logic used in ldr, str, mov, umov, fmov, mov; orn; bic; and,
+;   simd mov immediate; orr, simd mov immediate; eor; not (mvn)
+; latency 4 throughput 1/2 LS0/LS1: ldr
+; latency 1 throughput 1 LS0/LS1,SDI,I0/I1/I2: str
+; latency 3|4 throughput 1/2|1/4 F2/F3 F0/F1/F2/F3: fmov immed, orn,
+;   bic, and, orr, eor, not (mvn)
+; latency 4 throughput 1/4 F0/F1/F2/F3: fmov register, fmov gen to vec
+; latency 5 throughput 1/4 F0/F1/F2/F3: fmov vec to gen, umov, fmov
+(define_insn_reservation "thunderx3t11_asimd_logic4" 4
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_logic,neon_logic_q"))
+  "thunderx3t11_f23")
+
+(define_insn_reservation "thunderx3t11_asimd_logic5" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_logic,neon_logic_q"))
+  "thunderx3t11_f0123")
+
+;; ASIMD floating-point instructions.
+
+; Distinguish between latency 5 throughput 1/4: fabs, fmax, fmin, fneg
+; latency 4 throughput 1/4: fcmp
+(define_insn_reservation "thunderx3t11_asimd_fp_simple" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_fp_abs_s,neon_fp_abs_d,\
+			neon_fp_abs_s_q,neon_fp_abs_d_q,\
+			neon_fp_compare_s,neon_fp_compare_d,\
+			neon_fp_compare_s_q,neon_fp_compare_d_q,\
+			neon_fp_minmax_s,neon_fp_minmax_d,\
+			neon_fp_minmax_s_q,neon_fp_minmax_d_q,\
+			neon_fp_reduc_minmax_s,neon_fp_reduc_minmax_d,\
+			neon_fp_reduc_minmax_s_q,neon_fp_reduc_minmax_d_q,\
+			neon_fp_neg_s,neon_fp_neg_d,\
+			neon_fp_neg_s_q,neon_fp_neg_d_q"))
+  "thunderx3t11_f0123")
+
+; distinguish between latency 3 throughput 1/2,
+; latency 4 throughput 1/4
+; neon_fp_reduc_add_<stype><q> is used for both faddp and
+; vector reduction add. On TX3, faddp is 3|4 1/2|1/4 and reduction is 5 1/4
+(define_insn_reservation "thunderx3t11_asimd_fp_arith3" 3
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_fp_abd_s,neon_fp_abd_d,\
+			neon_fp_abd_s_q,neon_fp_abd_d_q,\
+			neon_fp_addsub_s,neon_fp_addsub_d,\
+			neon_fp_addsub_s_q,neon_fp_addsub_d_q,\
+			neon_fp_reduc_add_s,neon_fp_reduc_add_d,\
+			neon_fp_reduc_add_s_q,neon_fp_reduc_add_d_q"))
+  "thunderx3t11_f23")
+
+(define_insn_reservation "thunderx3t11_asimd_fp_arith4" 4
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_fp_abd_s,neon_fp_abd_d,\
+			neon_fp_abd_s_q,neon_fp_abd_d_q,\
+			neon_fp_addsub_s,neon_fp_addsub_d,\
+			neon_fp_addsub_s_q,neon_fp_addsub_d_q,\
+			neon_fp_reduc_add_s,neon_fp_reduc_add_d,\
+			neon_fp_reduc_add_s_q,neon_fp_reduc_add_d_q"))
+  "thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_asimd_fp_arith5" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_fp_mul_s,neon_fp_mul_d,\
+			neon_fp_mul_s_q,neon_fp_mul_d_q,\
+			neon_fp_mul_s_scalar_q,neon_fp_mul_d_scalar_q,\
+			neon_fp_mla_s,neon_fp_mla_d,\
+			neon_fp_mla_s_q,neon_fp_mla_d_q"))
+  "thunderx3t11_f0123")
+
+; neon_fp_cvt_widen_s,neon_fp_cvt_narrow_d_q: fcvtl,fcvtl2,fcvtn,fcvtn2
+; neon_fp_to_int_s,neon_fp_to_int_d: fcvt{<frint_suffix><su>,z<su>}
+;   where frint_suffix: zpmixan, su: su (plus other sign/unsign/extract...
+; neon_fp_to_int_s_q,neon_fp_to_int_d_q: fcvtz<su> other
+; The int_to_fp* is complicated
+;   neon_int_to_fp_s,neon_int_to_fp_d: <su_optab>cvtf
+;   neon_int_to_fp_s_q,neon_int_to_fp_d_q
+; Round matches single define_insn, frint<frint_suffix>
+;   neon_fp_round_s,neon_fp_round_d,neon_fp_round_s_q,
+;   neon_fp_round_d_q: frint<frint_suffix>
+; FCVT*,VCVTAU,[SU]CVTF: latency 5 throughput 1/4
+; FRINT*: latency 5 throughput 1/4
+(define_insn_reservation "thunderx3t11_asimd_fp_conv" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_fp_cvt_widen_s,neon_fp_cvt_narrow_d_q,\
+			neon_fp_to_int_s,neon_fp_to_int_d,\
+			neon_fp_to_int_s_q,neon_fp_to_int_d_q,\
+			neon_int_to_fp_s,neon_int_to_fp_d,\
+			neon_int_to_fp_s_q,neon_int_to_fp_d_q,\
+			neon_fp_round_s,neon_fp_round_d,\
+			neon_fp_round_s_q,neon_fp_round_d_q"))
+  "thunderx3t11_f0123")
+
+; model that pipeline is occupied the whole time D/F32, Q/F32: 16/4
+; Q/F64: 23/4
+(define_insn_reservation "thunderx3t11_asimd_fp_div_s" 16
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_fp_div_s,neon_fp_div_s_q"))
+  "thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_asimd_fp_div_d" 23
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_fp_div_d,neon_fp_div_d_q"))
+  "thunderx3t11_f0123")
+
+;; ASIMD miscellaneous instructions.
+
+;  divided out:
+;  rbit,bsl,bsl_q,cls,cls_q,cnt,cnt_q,move,move_q: 3|4 1/2 | 1/4
+;  from_gp,from_gp_q : 4 | 1/4
+;  dup,dup_q,ext,ext_q,ins,ins_q,all recpe forms, rev,rev_q: 5 1/4
+;  permute,permute_q need to depend on what aarch64_expand_vec_perm_const
+;  does on TX3
+(define_insn_reservation "thunderx3t11_asimd_misc3" 3
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_rbit,\
+			neon_bsl,neon_bsl_q,\
+			neon_cls,neon_cls_q,\
+			neon_cnt,neon_cnt_q,\
+			neon_move,neon_move_q"))
+  "thunderx3t11_f23")
+
+(define_insn_reservation "thunderx3t11_asimd_misc4" 4
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_rbit,\
+			neon_bsl,neon_bsl_q,\
+			neon_cls,neon_cls_q,\
+			neon_cnt,neon_cnt_q,\
+			neon_from_gp,neon_from_gp_q,\
+			neon_move,neon_move_q"))
+  "thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_asimd_misc" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "\
+			neon_dup,neon_dup_q,\
+			neon_ext,neon_ext_q,\
+			neon_ins,neon_ins_q,\
+			neon_move,neon_move_q,\
+			neon_fp_recpe_s,neon_fp_recpe_d,\
+			neon_fp_recpe_s_q,neon_fp_recpe_d_q,\
+			neon_fp_recpx_s,neon_fp_recpx_d,\
+			neon_fp_recpx_s_q,neon_fp_recpx_d_q,\
+			neon_rev,neon_rev_q,\
+			neon_permute,neon_permute_q"))
+  "thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_asimd_recip_step" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_fp_recps_s,neon_fp_recps_s_q,\
+			neon_fp_recps_d,neon_fp_recps_d_q,\
+			neon_fp_sqrt_s,neon_fp_sqrt_s_q,\
+			neon_fp_sqrt_d,neon_fp_sqrt_d_q,\
+			neon_fp_rsqrte_s, neon_fp_rsqrte_s_q,\
+			neon_fp_rsqrte_d, neon_fp_rsqrte_d_q,\
+			neon_fp_rsqrts_s, neon_fp_rsqrts_s_q,\
+			neon_fp_rsqrts_d, neon_fp_rsqrts_d_q"))
+  "thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_asimd_lut1" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_tbl1,neon_tbl1_q"))
+  "thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_asimd_lut2" 10
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_tbl2,neon_tbl2_q"))
+  "thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_asimd_lut3" 15
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_tbl3,neon_tbl3_q"))
+  "thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_asimd_lut4" 20
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_tbl4,neon_tbl4_q"))
+  "thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_asimd_elt_to_gr" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_to_gp,neon_to_gp_q"))
+  "thunderx3t11_f0123")
+
+;; ASIMD load instructions.
+
+; NOTE: These reservations attempt to model latency and throughput
+; correctly, but the cycle timing of unit allocation is not
+; necessarily accurate (because insns are split into uops, and those
+; may be issued out-of-order).
+
+; the LDP/LDNP imm-offset S/D/Q supplies the first arg with latency 4
+; and the 2nd at 5 (Q form) or 8 (S/D form). Can this be modeled? These
+; forms also do not appear to use the I0/I1/I2 units (no I3), but the
+; other LDP ones do.
+(define_insn_reservation "thunderx3t11_asimd_load1_ldp" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_ldp,neon_ldp_q"))
+  "thunderx3t11_i012,thunderx3t11_ls01")
+
+; Need to distinguish latency 6 throughput 2: 4 reg D/Q
+; latency 5 throughput 3/2: 3 reg D/Q
+; latency 4 throughput 1: 2 reg D/Q
+; latency 4 throughput 1/2: 1 reg D/Q
+(define_insn_reservation "thunderx3t11_asimd_load1" 4
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_load1_1reg,neon_load1_1reg_q,\
+			neon_load1_2reg,neon_load1_2reg_q,\
+			neon_load1_3reg,neon_load1_3reg_q,\
+			neon_load1_4reg,neon_load1_4reg_q"))
+  "thunderx3t11_ls01")
+
+(define_insn_reservation "thunderx3t11_asimd_load1_onelane" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_load1_one_lane,neon_load1_one_lane_q"))
+  "thunderx3t11_l01delay,thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_asimd_load1_all" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_load1_all_lanes,neon_load1_all_lanes_q"))
+  "thunderx3t11_l01delay,thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_asimd_load2" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_load2_2reg,neon_load2_2reg_q,\
+			neon_load2_one_lane,neon_load2_one_lane_q,\
+			neon_load2_all_lanes,neon_load2_all_lanes_q"))
+  "thunderx3t11_l01delay,thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_asimd_load3" 7
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_load3_3reg,neon_load3_3reg_q,\
+			neon_load3_one_lane,neon_load3_one_lane_q,\
+			neon_load3_all_lanes,neon_load3_all_lanes_q"))
+  "thunderx3t11_l01delay,thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_asimd_load4" 8
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_load4_4reg,neon_load4_4reg_q,\
+			neon_load4_one_lane,neon_load4_one_lane_q,\
+			neon_load4_all_lanes,neon_load4_all_lanes_q"))
+  "thunderx3t11_l01delay,thunderx3t11_f0123")
+
+;; ASIMD store instructions.
+
+; Same note applies as for ASIMD load instructions.
+
+; Vector Store pair Need to distinguish:
+; 5 throughput: imm-offset S/D; imm-postindex S/D; imm-preindex S/D
+; 2 throughput: imm-offset Q; imm-postindex Q; imm-preindex Q
+; all index modes use I0/I1/I2
+(define_insn_reservation "thunderx3t11_asimd_store_stp" 1
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_stp,neon_stp_q"))
+  "thunderx3t11_ls01,thunderx3t11_sd")
+
+; There are multiple forms of ST1; store1 covers all 1-4 reg D and Q forms.
+; Do the following two groups really not use the FP pipelines?
+; multiple, 1 reg, D-form     ST1
+; tx2_ltp:    x    1/2     LS0/LS1
+; tx3_ltp:    x    1/2     LS0/LS1
+; multiple, 1 reg, Q-form     ST1
+; tx2_ltp:    x    1/2     LS0/LS1
+; tx3_ltp:    x    1/2     LS0/LS1
+;
+; one lane, B/H/S         ST1
+; tx2_ltp:    x       1/2     LS0/LS1,F0/F1
+; tx3_ltp:    x       1/2     LS0/LS1,F0/F1/F2/F3
+; one lane, D             ST1
+; tx2_ltp:    x       1/2     LS0/LS1,F0/F1
+; tx3_ltp:    x       1/2     LS0/LS1,F0/F1/F2/F3
+;; CN11K missing entries for these st1 insn
+; multiple, 2 reg, D-form     ST1     x    1     LS0/LS1
+; multiple, 2 reg, Q-form     ST1     x    1     LS0/LS1
+; multiple, 3 reg, D-form     ST1     x    3/2     LS0/LS1
+; multiple, 3 reg, Q-form     ST1     x    3/2     LS0/LS1
+; multiple,4 reg, D-form         ST1     x    2     LS0/LS1
+; multiple,4 reg, Q-form         ST1     x    2     LS0/LS1
+(define_insn_reservation "thunderx3t11_asimd_store1" 1
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_store1_1reg,neon_store1_1reg_q,neon_store1_2reg,\
+			neon_store1_2reg_q,neon_store1_3reg,neon_store1_3reg_q,\
+			neon_store1_4reg,neon_store1_4reg_q"))
+  "thunderx3t11_ls01")
+
+(define_insn_reservation "thunderx3t11_asimd_store1_onelane" 1
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_store1_one_lane,neon_store1_one_lane_q"))
+  "thunderx3t11_ls01,thunderx3t11_f0123")
+
+; distinguish between throughput 1: D/Q-form B/H/S, Q-form D and
+; throughput 1/2: one lane B/H/S/D
+(define_insn_reservation "thunderx3t11_asimd_store2" 1
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_store2_2reg,neon_store2_2reg_q,\
+			neon_store2_one_lane,neon_store2_one_lane_q"))
+  "thunderx3t11_ls01,thunderx3t11_f0123")
+
+; distinguish between throughput 3: D/Q-form B/H/S, Q-form D and
+; throughput 1: one lane B/H/S/D
+(define_insn_reservation "thunderx3t11_asimd_store3" 1
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_store3_3reg,neon_store3_3reg_q,\
+			neon_store3_one_lane,neon_store3_one_lane_q"))
+  "thunderx3t11_ls01,thunderx3t11_f0123")
+
+; distinguish between throughput 4: D/Q-form B/H/S, Q-form D and
+; throughput 1: one lane B/H/S/D? (not in doc)
+(define_insn_reservation "thunderx3t11_asimd_store4" 1
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "neon_store4_4reg,neon_store4_4reg_q,\
+			neon_store4_one_lane,neon_store4_one_lane_q"))
+  "thunderx3t11_ls01,thunderx3t11_f0123")
+
+;; Crypto extensions.
+
+(define_insn_reservation "thunderx3t11_aes" 4
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "crypto_aese,crypto_aesmc"))
+  "thunderx3t11_f0123")
+
+(define_insn_reservation "thunderx3t11_sha" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "crypto_sha1_fast,crypto_sha1_xor,crypto_sha1_slow,\
+			crypto_sha256_fast,crypto_sha256_slow"))
+  "thunderx3t11_f0123")
+
+;; CRC extension.
+
+(define_insn_reservation "thunderx3t11_crc" 3
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "crc"))
+  "thunderx3t11_i1")
+
+;; PMULL extension.
+
+(define_insn_reservation "thunderx3t11_pmull" 5
+  (and (eq_attr "tune" "thunderx3t11")
+       (eq_attr "type" "crypto_pmull"))
+  "thunderx3t11_f0123")
diff --git a/gcc/doc/invoke.texi b/gcc/doc/invoke.texi
index 8b5cd82..00a5aec 100644
--- a/gcc/doc/invoke.texi
+++ b/gcc/doc/invoke.texi
@@ -16947,7 +16947,7 @@  performance of the code.  Permissible values for this option are:
 @samp{octeontx2f95mm}
 @samp{thunderx}, @samp{thunderxt88},
 @samp{thunderxt88p1}, @samp{thunderxt81}, @samp{tsv110},
-@samp{thunderxt83}, @samp{thunderx2t99},
+@samp{thunderxt83}, @samp{thunderx2t99}, @samp{thunderx3t11},
 @samp{cortex-a57.cortex-a53}, @samp{cortex-a72.cortex-a53},
 @samp{cortex-a73.cortex-a35}, @samp{cortex-a73.cortex-a53},
 @samp{cortex-a75.cortex-a55}, @samp{cortex-a76.cortex-a55}
-- 
2.7.4