test/CodeGen/AArch64/neon-simd-ldst-one.ll

   1 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
   2
     ; Named aggregate types used as return types by the multi-vector tests in
     ; this file. Each wraps an array of 2, 3 or 4 identical vectors.
     ; Two-vector aggregates.
   3 %struct.int8x16x2_t = type { [2 x <16 x i8>] }
   4 %struct.int16x8x2_t = type { [2 x <8 x i16>] }
   5 %struct.int32x4x2_t = type { [2 x <4 x i32>] }
   6 %struct.int64x2x2_t = type { [2 x <2 x i64>] }
   7 %struct.float32x4x2_t = type { [2 x <4 x float>] }
   8 %struct.float64x2x2_t = type { [2 x <2 x double>] }
   9 %struct.int8x8x2_t = type { [2 x <8 x i8>] }
  10 %struct.int16x4x2_t = type { [2 x <4 x i16>] }
  11 %struct.int32x2x2_t = type { [2 x <2 x i32>] }
  12 %struct.int64x1x2_t = type { [2 x <1 x i64>] }
  13 %struct.float32x2x2_t = type { [2 x <2 x float>] }
  14 %struct.float64x1x2_t = type { [2 x <1 x double>] }
     ; Three-vector aggregates.
  15 %struct.int8x16x3_t = type { [3 x <16 x i8>] }
  16 %struct.int16x8x3_t = type { [3 x <8 x i16>] }
  17 %struct.int32x4x3_t = type { [3 x <4 x i32>] }
  18 %struct.int64x2x3_t = type { [3 x <2 x i64>] }
  19 %struct.float32x4x3_t = type { [3 x <4 x float>] }
  20 %struct.float64x2x3_t = type { [3 x <2 x double>] }
  21 %struct.int8x8x3_t = type { [3 x <8 x i8>] }
  22 %struct.int16x4x3_t = type { [3 x <4 x i16>] }
  23 %struct.int32x2x3_t = type { [3 x <2 x i32>] }
  24 %struct.int64x1x3_t = type { [3 x <1 x i64>] }
  25 %struct.float32x2x3_t = type { [3 x <2 x float>] }
  26 %struct.float64x1x3_t = type { [3 x <1 x double>] }
     ; Four-vector aggregates.
  27 %struct.int8x16x4_t = type { [4 x <16 x i8>] }
  28 %struct.int16x8x4_t = type { [4 x <8 x i16>] }
  29 %struct.int32x4x4_t = type { [4 x <4 x i32>] }
  30 %struct.int64x2x4_t = type { [4 x <2 x i64>] }
  31 %struct.float32x4x4_t = type { [4 x <4 x float>] }
  32 %struct.float64x2x4_t = type { [4 x <2 x double>] }
  33 %struct.int8x8x4_t = type { [4 x <8 x i8>] }
  34 %struct.int16x4x4_t = type { [4 x <4 x i16>] }
  35 %struct.int32x2x4_t = type { [4 x <2 x i32>] }
  36 %struct.int64x1x4_t = type { [4 x <1 x i64>] }
  37 %struct.float32x2x4_t = type { [4 x <2 x float>] }
  38 %struct.float64x1x4_t = type { [4 x <1 x double>] }
  39
  40 define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
     ; Loads a single i8 from %a, places it in lane 0 and broadcasts it to all
     ; 16 lanes with a zero-mask shufflevector. The pattern below expects the
     ; whole sequence to become one ld1r into a .16b register addressed by [x0].
  41 ; CHECK-LABEL: test_vld1q_dup_s8
  42 ; CHECK: ld1r {{{v[0-9]+}}.16b}, [x0]
  43 entry:
  44   %0 = load i8* %a, align 1
  45   %1 = insertelement <16 x i8> undef, i8 %0, i32 0
  46   %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
  47   ret <16 x i8> %lane
  48 }
  49
  50 define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
     ; Loads a single i16 from %a, places it in lane 0 and broadcasts it to all
     ; 8 lanes with a zero-mask shufflevector. The pattern below expects the
     ; whole sequence to become one ld1r into a .8h register addressed by [x0].
  51 ; CHECK-LABEL: test_vld1q_dup_s16
  52 ; CHECK: ld1r {{{v[0-9]+}}.8h}, [x0]
  53 entry:
  54   %0 = load i16* %a, align 2
  55   %1 = insertelement <8 x i16> undef, i16 %0, i32 0
  56   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
  57   ret <8 x i16> %lane
  58 }
  59
  60 define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
     ; Loads a single i32 from %a, places it in lane 0 and broadcasts it to all
     ; 4 lanes with a zero-mask shufflevector. The pattern below expects the
     ; whole sequence to become one ld1r into a .4s register addressed by [x0].
  61 ; CHECK-LABEL: test_vld1q_dup_s32
  62 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
  63 entry:
  64   %0 = load i32* %a, align 4
  65   %1 = insertelement <4 x i32> undef, i32 %0, i32 0
  66   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
  67   ret <4 x i32> %lane
  68 }
  69
  70 define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
     ; Loads a single i64 from %a, places it in lane 0 and broadcasts it to both
     ; lanes with a zero-mask shufflevector. The pattern below expects the
     ; whole sequence to become one ld1r into a .2d register addressed by [x0].
  71 ; CHECK-LABEL: test_vld1q_dup_s64
  72 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
  73 entry:
  74   %0 = load i64* %a, align 8
  75   %1 = insertelement <2 x i64> undef, i64 %0, i32 0
  76   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
  77   ret <2 x i64> %lane
  78 }
  79
  80 define <4 x float> @test_vld1q_dup_f32(float* %a) {
     ; Loads a single float from %a, places it in lane 0 and broadcasts it to
     ; all 4 lanes with a zero-mask shufflevector. The pattern below expects the
     ; whole sequence to become one ld1r into a .4s register addressed by [x0].
  81 ; CHECK-LABEL: test_vld1q_dup_f32
  82 ; CHECK: ld1r {{{v[0-9]+}}.4s}, [x0]
  83 entry:
  84   %0 = load float* %a, align 4
  85   %1 = insertelement <4 x float> undef, float %0, i32 0
  86   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
  87   ret <4 x float> %lane
  88 }
  89
  90 define <2 x double> @test_vld1q_dup_f64(double* %a) {
     ; Loads a single double from %a, places it in lane 0 and broadcasts it to
     ; both lanes with a zero-mask shufflevector. The pattern below expects the
     ; whole sequence to become one ld1r into a .2d register addressed by [x0].
  91 ; CHECK-LABEL: test_vld1q_dup_f64
  92 ; CHECK: ld1r {{{v[0-9]+}}.2d}, [x0]
  93 entry:
  94   %0 = load double* %a, align 8
  95   %1 = insertelement <2 x double> undef, double %0, i32 0
  96   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
  97   ret <2 x double> %lane
  98 }
  99
 100 define <8 x i8> @test_vld1_dup_s8(i8* %a) {
     ; Loads a single i8 from %a, places it in lane 0 and broadcasts it to all
     ; 8 lanes with a zero-mask shufflevector. The pattern below expects the
     ; whole sequence to become one ld1r into a .8b register addressed by [x0].
 101 ; CHECK-LABEL: test_vld1_dup_s8
 102 ; CHECK: ld1r {{{v[0-9]+}}.8b}, [x0]
 103 entry:
 104   %0 = load i8* %a, align 1
 105   %1 = insertelement <8 x i8> undef, i8 %0, i32 0
 106   %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
 107   ret <8 x i8> %lane
 108 }
 109
 110 define <4 x i16> @test_vld1_dup_s16(i16* %a) {
     ; Loads a single i16 from %a, places it in lane 0 and broadcasts it to all
     ; 4 lanes with a zero-mask shufflevector. The pattern below expects the
     ; whole sequence to become one ld1r into a .4h register addressed by [x0].
 111 ; CHECK-LABEL: test_vld1_dup_s16
 112 ; CHECK: ld1r {{{v[0-9]+}}.4h}, [x0]
 113 entry:
 114   %0 = load i16* %a, align 2
 115   %1 = insertelement <4 x i16> undef, i16 %0, i32 0
 116   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
 117   ret <4 x i16> %lane
 118 }
 119
 120 define <2 x i32> @test_vld1_dup_s32(i32* %a) {
     ; Loads a single i32 from %a, places it in lane 0 and broadcasts it to both
     ; lanes with a zero-mask shufflevector. The pattern below expects the
     ; whole sequence to become one ld1r into a .2s register addressed by [x0].
 121 ; CHECK-LABEL: test_vld1_dup_s32
 122 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
 123 entry:
 124   %0 = load i32* %a, align 4
 125   %1 = insertelement <2 x i32> undef, i32 %0, i32 0
 126   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
 127   ret <2 x i32> %lane
 128 }
 129
 130 define <1 x i64> @test_vld1_dup_s64(i64* %a) {
     ; Loads a single i64 from %a and inserts it into the only lane of a
     ; <1 x i64>; there is no shufflevector here because the vector has just
     ; that one lane. The pattern below still expects an ld1r (.1d) from [x0].
 131 ; CHECK-LABEL: test_vld1_dup_s64
 132 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
 133 entry:
 134   %0 = load i64* %a, align 8
 135   %1 = insertelement <1 x i64> undef, i64 %0, i32 0
 136   ret <1 x i64> %1
 137 }
 138
 139 define <2 x float> @test_vld1_dup_f32(float* %a) {
     ; Loads a single float from %a, places it in lane 0 and broadcasts it to
     ; both lanes with a zero-mask shufflevector. The pattern below expects the
     ; whole sequence to become one ld1r into a .2s register addressed by [x0].
 140 ; CHECK-LABEL: test_vld1_dup_f32
 141 ; CHECK: ld1r {{{v[0-9]+}}.2s}, [x0]
 142 entry:
 143   %0 = load float* %a, align 4
 144   %1 = insertelement <2 x float> undef, float %0, i32 0
 145   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 146   ret <2 x float> %lane
 147 }
 148
 149 define <1 x double> @test_vld1_dup_f64(double* %a) {
     ; Loads a single double from %a and inserts it into the only lane of a
     ; <1 x double>; there is no shufflevector here because the vector has just
     ; that one lane. The pattern below still expects an ld1r (.1d) from [x0].
 150 ; CHECK-LABEL: test_vld1_dup_f64
 151 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
 152 entry:
 153   %0 = load double* %a, align 8
 154   %1 = insertelement <1 x double> undef, double %0, i32 0
 155   ret <1 x double> %1
 156 }
 157
 158 define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
     ; Calls vld2lane on %a with undef input vectors, broadcasts lane 0 of each
     ; of the two results (zero-mask shufflevector) and packs them into the
     ; int8x16x2_t aggregate. The pattern below expects a single ld2r of two
     ; .16b registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 159 ; CHECK-LABEL: test_vld2q_dup_s8
 160 ; CHECK: ld2r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
 161 entry:
 162   %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
 163   %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
 164   %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
 165   %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
 166   %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
 167   %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
 168   %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
 169   ret %struct.int8x16x2_t %.fca.0.1.insert
 170 }
 171
 172 define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld2lane with undef input vectors, broadcasts lane 0 of each of the two
     ; results and packs them into the int16x8x2_t aggregate. The pattern below
     ; expects a single ld2r of two .8h registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 173 ; CHECK-LABEL: test_vld2q_dup_s16
 174 ; CHECK: ld2r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
 175 entry:
 176   %0 = bitcast i16* %a to i8*
 177   %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
 178   %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
 179   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
 180   %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
 181   %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
 182   %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
 183   %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
 184   ret %struct.int16x8x2_t %.fca.0.1.insert
 185 }
 186
 187 define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld2lane with undef input vectors, broadcasts lane 0 of each of the two
     ; results and packs them into the int32x4x2_t aggregate. The pattern below
     ; expects a single ld2r of two .4s registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 188 ; CHECK-LABEL: test_vld2q_dup_s32
 189 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 190 entry:
 191   %0 = bitcast i32* %a to i8*
 192   %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
 193   %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
 194   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
 195   %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
 196   %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
 197   %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
 198   %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
 199   ret %struct.int32x4x2_t %.fca.0.1.insert
 200 }
 201
 202 define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld2lane with undef input vectors, broadcasts lane 0 of each of the two
     ; results and packs them into the int64x2x2_t aggregate. The pattern below
     ; expects a single ld2r of two .2d registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 203 ; CHECK-LABEL: test_vld2q_dup_s64
 204 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 205 entry:
 206   %0 = bitcast i64* %a to i8*
 207   %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
 208   %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
 209   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
 210   %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
 211   %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
 212   %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
 213   %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
 214   ret %struct.int64x2x2_t %.fca.0.1.insert
 215 }
 216
 217 define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld2lane with undef input vectors, broadcasts lane 0 of each of the two
     ; results and packs them into the float32x4x2_t aggregate. The pattern
     ; below expects a single ld2r of two .4s registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 218 ; CHECK-LABEL: test_vld2q_dup_f32
 219 ; CHECK: ld2r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 220 entry:
 221   %0 = bitcast float* %a to i8*
 222   %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
 223   %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
 224   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
 225   %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
 226   %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
 227   %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
 228   %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
 229   ret %struct.float32x4x2_t %.fca.0.1.insert
 230 }
 231
 232 define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld2lane with undef input vectors, broadcasts lane 0 of each of the two
     ; results and packs them into the float64x2x2_t aggregate. The pattern
     ; below expects a single ld2r of two .2d registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 233 ; CHECK-LABEL: test_vld2q_dup_f64
 234 ; CHECK: ld2r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 235 entry:
 236   %0 = bitcast double* %a to i8*
 237   %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
 238   %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
 239   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
 240   %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
 241   %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
 242   %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
 243   %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
 244   ret %struct.float64x2x2_t %.fca.0.1.insert
 245 }
 246
 247 define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
     ; Calls vld2lane on %a with undef input vectors, broadcasts lane 0 of each
     ; of the two results (zero-mask shufflevector) and packs them into the
     ; int8x8x2_t aggregate. The pattern below expects a single ld2r of two
     ; .8b registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 248 ; CHECK-LABEL: test_vld2_dup_s8
 249 ; CHECK: ld2r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
 250 entry:
 251   %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
 252   %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
 253   %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
 254   %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
 255   %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
 256   %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
 257   %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
 258   ret %struct.int8x8x2_t %.fca.0.1.insert
 259 }
 260
 261 define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld2lane with undef input vectors, broadcasts lane 0 of each of the two
     ; results and packs them into the int16x4x2_t aggregate. The pattern below
     ; expects a single ld2r of two .4h registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 262 ; CHECK-LABEL: test_vld2_dup_s16
 263 ; CHECK: ld2r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
 264 entry:
 265   %0 = bitcast i16* %a to i8*
 266   %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
 267   %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
 268   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
 269   %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
 270   %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
 271   %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
 272   %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
 273   ret %struct.int16x4x2_t %.fca.0.1.insert
 274 }
 275
 276 define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld2lane with undef input vectors, broadcasts lane 0 of each of the two
     ; results and packs them into the int32x2x2_t aggregate. The pattern below
     ; expects a single ld2r of two .2s registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 277 ; CHECK-LABEL: test_vld2_dup_s32
 278 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 279 entry:
 280   %0 = bitcast i32* %a to i8*
 281   %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
 282   %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
 283   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
 284   %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
 285   %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
 286   %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
 287   %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
 288   ret %struct.int32x2x2_t %.fca.0.1.insert
 289 }
 290
 291 define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
     ; Bitcasts %a to i8* and calls the whole-vector vld2 intrinsic (no lane
     ; operand, no shufflevector), then repacks both <1 x i64> results into the
     ; int64x1x2_t aggregate. The pattern below expects a two-register ld1 of
     ; .1d vectors from [x0], not an ld2r.
     ; NOTE(review): the trailing i32 8 is presumably the alignment -- confirm.
 292 ; CHECK-LABEL: test_vld2_dup_s64
 293 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 294 entry:
 295   %0 = bitcast i64* %a to i8*
 296   %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
 297   %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
 298   %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
 299   %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
 300   %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
 301   ret %struct.int64x1x2_t %.fca.0.1.insert
 302 }
 303
 304 define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld2lane with undef input vectors, broadcasts lane 0 of each of the two
     ; results and packs them into the float32x2x2_t aggregate. The pattern
     ; below expects a single ld2r of two .2s registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 305 ; CHECK-LABEL: test_vld2_dup_f32
 306 ; CHECK: ld2r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 307 entry:
 308   %0 = bitcast float* %a to i8*
 309   %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
 310   %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
 311   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 312   %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
 313   %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
 314   %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
 315   %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
 316   ret %struct.float32x2x2_t %.fca.0.1.insert
 317 }
 318
 319 define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
     ; Bitcasts %a to i8* and calls the whole-vector vld2 intrinsic (no lane
     ; operand, no shufflevector), then repacks both <1 x double> results into
     ; the float64x1x2_t aggregate. The pattern below expects a two-register
     ; ld1 of .1d vectors from [x0], not an ld2r.
     ; NOTE(review): the trailing i32 8 is presumably the alignment -- confirm.
 320 ; CHECK-LABEL: test_vld2_dup_f64
 321 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 322 entry:
 323   %0 = bitcast double* %a to i8*
 324   %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
 325   %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
 326   %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
 327   %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
 328   %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
 329   ret %struct.float64x1x2_t %.fca.0.1.insert
 330 }
 331
 332 define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
     ; Calls vld3lane on %a with undef input vectors, broadcasts lane 0 of each
     ; of the three results (zero-mask shufflevector) and packs them into the
     ; int8x16x3_t aggregate. The pattern below expects a single ld3r of three
     ; .16b registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 333 ; CHECK-LABEL: test_vld3q_dup_s8
 334 ; CHECK: ld3r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
 335 entry:
 336   %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
 337   %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
 338   %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
 339   %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
 340   %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
 341   %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
 342   %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
 343   %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
 344   %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
 345   %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
 346   ret %struct.int8x16x3_t %.fca.0.2.insert
 347 }
 348
 349 define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld3lane with undef input vectors, broadcasts lane 0 of each of the three
     ; results and packs them into the int16x8x3_t aggregate. The pattern below
     ; expects a single ld3r of three .8h registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 350 ; CHECK-LABEL: test_vld3q_dup_s16
 351 ; CHECK: ld3r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
 352 entry:
 353   %0 = bitcast i16* %a to i8*
 354   %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
 355   %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
 356   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
 357   %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
 358   %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
 359   %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
 360   %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
 361   %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
 362   %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
 363   %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
 364   ret %struct.int16x8x3_t %.fca.0.2.insert
 365 }
 366
 367 define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld3lane with undef input vectors, broadcasts lane 0 of each of the three
     ; results and packs them into the int32x4x3_t aggregate. The pattern below
     ; expects a single ld3r of three .4s registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 368 ; CHECK-LABEL: test_vld3q_dup_s32
 369 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 370 entry:
 371   %0 = bitcast i32* %a to i8*
 372   %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
 373   %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
 374   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
 375   %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
 376   %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
 377   %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
 378   %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
 379   %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
 380   %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
 381   %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
 382   ret %struct.int32x4x3_t %.fca.0.2.insert
 383 }
 384
 385 define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld3lane with undef input vectors, broadcasts lane 0 of each of the three
     ; results and packs them into the int64x2x3_t aggregate. The pattern below
     ; expects a single ld3r of three .2d registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 386 ; CHECK-LABEL: test_vld3q_dup_s64
 387 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 388 entry:
 389   %0 = bitcast i64* %a to i8*
 390   %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
 391   %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
 392   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
 393   %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
 394   %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
 395   %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
 396   %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
 397   %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
 398   %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
 399   %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
 400   ret %struct.int64x2x3_t %.fca.0.2.insert
 401 }
 402
 403 define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld3lane with undef input vectors, broadcasts lane 0 of each of the three
     ; results and packs them into the float32x4x3_t aggregate. The pattern
     ; below expects a single ld3r of three .4s registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 404 ; CHECK-LABEL: test_vld3q_dup_f32
 405 ; CHECK: ld3r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 406 entry:
 407   %0 = bitcast float* %a to i8*
 408   %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
 409   %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
 410   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
 411   %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
 412   %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
 413   %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
 414   %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
 415   %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
 416   %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
 417   %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
 418   ret %struct.float32x4x3_t %.fca.0.2.insert
 419 }
 420
 421 define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld3lane with undef input vectors, broadcasts lane 0 of each of the three
     ; results and packs them into the float64x2x3_t aggregate. The pattern
     ; below expects a single ld3r of three .2d registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 422 ; CHECK-LABEL: test_vld3q_dup_f64
 423 ; CHECK: ld3r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 424 entry:
 425   %0 = bitcast double* %a to i8*
 426   %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
 427   %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
 428   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
 429   %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
 430   %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
 431   %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
 432   %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
 433   %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
 434   %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
 435   %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
 436   ret %struct.float64x2x3_t %.fca.0.2.insert
 437 }
 438
 439 define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
     ; Calls vld3lane on %a with undef input vectors, broadcasts lane 0 of each
     ; of the three results (zero-mask shufflevector) and packs them into the
     ; int8x8x3_t aggregate. The pattern below expects a single ld3r of three
     ; .8b registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 440 ; CHECK-LABEL: test_vld3_dup_s8
 441 ; CHECK: ld3r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
 442 entry:
 443   %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
 444   %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
 445   %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
 446   %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
 447   %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
 448   %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
 449   %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
 450   %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
 451   %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
 452   %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
 453   ret %struct.int8x8x3_t %.fca.0.2.insert
 454 }
 455
 456 define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld3lane with undef input vectors, broadcasts lane 0 of each of the three
     ; results and packs them into the int16x4x3_t aggregate. The pattern below
     ; expects a single ld3r of three .4h registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 457 ; CHECK-LABEL: test_vld3_dup_s16
 458 ; CHECK: ld3r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
 459 entry:
 460   %0 = bitcast i16* %a to i8*
 461   %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
 462   %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
 463   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
 464   %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
 465   %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
 466   %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
 467   %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
 468   %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
 469   %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
 470   %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
 471   ret %struct.int16x4x3_t %.fca.0.2.insert
 472 }
 473
 474 define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld3lane with undef input vectors, broadcasts lane 0 of each of the three
     ; results and packs them into the int32x2x3_t aggregate. The pattern below
     ; expects a single ld3r of three .2s registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 475 ; CHECK-LABEL: test_vld3_dup_s32
 476 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 477 entry:
 478   %0 = bitcast i32* %a to i8*
 479   %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
 480   %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
 481   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
 482   %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
 483   %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
 484   %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
 485   %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
 486   %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
 487   %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
 488   %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
 489   ret %struct.int32x2x3_t %.fca.0.2.insert
 490 }
 491
 492 define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
     ; Bitcasts %a to i8* and calls the whole-vector vld3 intrinsic (no lane
     ; operand, no shufflevector), then repacks the three <1 x i64> results into
     ; the int64x1x3_t aggregate. The pattern below expects a three-register
     ; ld1 of .1d vectors from [x0], not an ld3r.
     ; NOTE(review): the trailing i32 8 is presumably the alignment -- confirm.
 493 ; CHECK-LABEL: test_vld3_dup_s64
 494 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 495 entry:
 496   %0 = bitcast i64* %a to i8*
 497   %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
 498   %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
 499   %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
 500   %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
 501   %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
 502   %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
 503   %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
 504   ret %struct.int64x1x3_t %.fca.0.2.insert
 505 }
 506
 507 define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
     ; Bitcasts %a to i8* (the pointer type passed to the intrinsic), calls
     ; vld3lane with undef input vectors, broadcasts lane 0 of each of the three
     ; results and packs them into the float32x2x3_t aggregate. The pattern
     ; below expects a single ld3r of three .2s registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 508 ; CHECK-LABEL: test_vld3_dup_f32
 509 ; CHECK: ld3r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 510 entry:
 511   %0 = bitcast float* %a to i8*
 512   %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
 513   %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
 514   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 515   %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
 516   %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
 517   %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
 518   %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
 519   %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
 520   %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
 521   %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
 522   ret %struct.float32x2x3_t %.fca.0.2.insert
 523 }
 524
 525 define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
     ; Bitcasts %a to i8* and calls the whole-vector vld3 intrinsic (no lane
     ; operand, no shufflevector), then repacks the three <1 x double> results
     ; into the float64x1x3_t aggregate. The pattern below expects a
     ; three-register ld1 of .1d vectors from [x0], not an ld3r.
     ; NOTE(review): the trailing i32 8 is presumably the alignment -- confirm.
 526 ; CHECK-LABEL: test_vld3_dup_f64
 527 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 528 entry:
 529   %0 = bitcast double* %a to i8*
 530   %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
 531   %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
 532   %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
 533   %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
 534   %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
 535   %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
 536   %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
 537   ret %struct.float64x1x3_t %.fca.0.2.insert
 538 }
 539
 540 define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
     ; Calls vld4lane on %a with undef input vectors, broadcasts lane 0 of each
     ; of the four results (zero-mask shufflevector) and packs them into the
     ; int8x16x4_t aggregate. The pattern below expects a single ld4r of four
     ; .16b registers from [x0].
     ; NOTE(review): trailing i32 operands are presumably lane index and alignment -- confirm.
 541 ; CHECK-LABEL: test_vld4q_dup_s8
 542 ; CHECK: ld4r {{{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b}, [x0]
 543 entry:
 544   %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
 545   %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
 546   %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
 547   %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
 548   %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
 549   %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
 550   %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
 551   %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
 552   %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
 553   %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
 554   %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
 555   %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
 556   %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
 557   ret %struct.int8x16x4_t %.fca.0.3.insert
 558 }
 559
 560 define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
     ; vld4q_dup for <8 x i16>: a vld4lane into lane 0 of undef vectors, followed
     ; by a lane-0 splat of every result, is expected to become one ld4r of .8h
     ; registers addressed by [x0].
 561 ; CHECK-LABEL: test_vld4q_dup_s16
 562 ; CHECK: ld4r {{{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h}, [x0]
 563 entry:
 564   %0 = bitcast i16* %a to i8*
 565   %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
       ; Splat lane 0 of each result (zeroinitializer shuffle mask).
 566   %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
 567   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
 568   %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
 569   %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
 570   %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
 571   %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
 572   %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
 573   %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
       ; Pack the four splats into the returned struct.
 574   %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
 575   %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
 576   %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
 577   %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
 578   ret %struct.int16x8x4_t %.fca.0.3.insert
 579 }
 580
 581 define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
     ; vld4q_dup for <4 x i32>: a vld4lane into lane 0 of undef vectors, followed
     ; by a lane-0 splat of every result, is expected to become one ld4r of .4s
     ; registers addressed by [x0].
 582 ; CHECK-LABEL: test_vld4q_dup_s32
 583 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 584 entry:
 585   %0 = bitcast i32* %a to i8*
 586   %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
       ; Splat lane 0 of each result (zeroinitializer shuffle mask).
 587   %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
 588   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
 589   %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
 590   %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
 591   %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
 592   %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
 593   %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
 594   %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
       ; Pack the four splats into the returned struct.
 595   %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
 596   %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
 597   %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
 598   %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
 599   ret %struct.int32x4x4_t %.fca.0.3.insert
 600 }
 601
 602 define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
     ; vld4q_dup for <2 x i64>: a vld4lane into lane 0 of undef vectors, followed
     ; by a lane-0 splat of every result, is expected to become one ld4r of .2d
     ; registers addressed by [x0].
 603 ; CHECK-LABEL: test_vld4q_dup_s64
 604 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 605 entry:
 606   %0 = bitcast i64* %a to i8*
 607   %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
       ; Splat lane 0 of each result (zeroinitializer shuffle mask).
 608   %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
 609   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
 610   %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
 611   %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
 612   %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
 613   %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
 614   %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
 615   %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
       ; Pack the four splats into the returned struct.
 616   %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
 617   %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
 618   %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
 619   %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
 620   ret %struct.int64x2x4_t %.fca.0.3.insert
 621 }
 622
 623 define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
     ; vld4q_dup for <4 x float>: a vld4lane into lane 0 of undef vectors,
     ; followed by a lane-0 splat of every result, is expected to become one ld4r
     ; of .4s registers addressed by [x0].
 624 ; CHECK-LABEL: test_vld4q_dup_f32
 625 ; CHECK: ld4r {{{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s}, [x0]
 626 entry:
 627   %0 = bitcast float* %a to i8*
 628   %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
       ; Splat lane 0 of each result (zeroinitializer shuffle mask).
 629   %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
 630   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
 631   %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
 632   %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
 633   %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
 634   %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
 635   %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
 636   %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
       ; Pack the four splats into the returned struct.
 637   %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
 638   %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
 639   %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
 640   %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
 641   ret %struct.float32x4x4_t %.fca.0.3.insert
 642 }
 643
 644 define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
     ; vld4q_dup for <2 x double>: a vld4lane into lane 0 of undef vectors,
     ; followed by a lane-0 splat of every result, is expected to become one ld4r
     ; of .2d registers addressed by [x0].
 645 ; CHECK-LABEL: test_vld4q_dup_f64
 646 ; CHECK: ld4r {{{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d}, [x0]
 647 entry:
 648   %0 = bitcast double* %a to i8*
 649   %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
       ; Splat lane 0 of each result (zeroinitializer shuffle mask).
 650   %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
 651   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
 652   %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
 653   %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
 654   %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
 655   %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
 656   %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
 657   %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
       ; Pack the four splats into the returned struct.
 658   %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
 659   %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
 660   %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
 661   %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
 662   ret %struct.float64x2x4_t %.fca.0.3.insert
 663 }
 664
 665 define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
     ; vld4_dup for <8 x i8>: a vld4lane into lane 0 of undef vectors, followed by
     ; a lane-0 splat of every result, is expected to become one ld4r of .8b
     ; registers addressed by [x0].
 666 ; CHECK-LABEL: test_vld4_dup_s8
 667 ; CHECK: ld4r {{{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b}, [x0]
 668 entry:
 669   %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
       ; Splat lane 0 of each result (zeroinitializer shuffle mask).
 670   %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
 671   %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
 672   %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
 673   %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
 674   %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
 675   %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
 676   %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
 677   %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
       ; Pack the four splats into the returned struct.
 678   %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
 679   %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
 680   %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
 681   %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
 682   ret %struct.int8x8x4_t %.fca.0.3.insert
 683 }
 684
 685 define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
     ; vld4_dup for <4 x i16>: a vld4lane into lane 0 of undef vectors, followed
     ; by a lane-0 splat of every result, is expected to become one ld4r of .4h
     ; registers addressed by [x0].
 686 ; CHECK-LABEL: test_vld4_dup_s16
 687 ; CHECK: ld4r {{{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h}, [x0]
 688 entry:
 689   %0 = bitcast i16* %a to i8*
 690   %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
       ; Splat lane 0 of each result (zeroinitializer shuffle mask).
 691   %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
 692   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
 693   %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
 694   %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
 695   %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
 696   %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
 697   %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
 698   %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
       ; Pack the four splats into the returned struct.
 699   %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
 700   %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
 701   %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
 702   %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
 703   ret %struct.int16x4x4_t %.fca.0.3.insert
 704 }
 705
 706 define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
     ; vld4_dup for <2 x i32>: a vld4lane into lane 0 of undef vectors, followed
     ; by a lane-0 splat of every result, is expected to become one ld4r of .2s
     ; registers addressed by [x0].
 707 ; CHECK-LABEL: test_vld4_dup_s32
 708 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 709 entry:
 710   %0 = bitcast i32* %a to i8*
 711   %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
       ; Splat lane 0 of each result (zeroinitializer shuffle mask).
 712   %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
 713   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
 714   %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
 715   %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
 716   %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
 717   %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
 718   %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
 719   %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
       ; Pack the four splats into the returned struct.
 720   %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
 721   %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
 722   %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
 723   %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
 724   ret %struct.int32x2x4_t %.fca.0.3.insert
 725 }
 726
 727 define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
     ; vld4_dup for <1 x i64>: the IR is a plain llvm.arm.neon.vld4.v1i64 call (no
     ; lane load + splat), and the expected code is a four-register ld1 of .1d
     ; vectors from [x0] -- presumably because a one-lane vector needs no
     ; replication (NOTE(review): confirm).
 728 ; CHECK-LABEL: test_vld4_dup_s64
 729 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 730 entry:
 731   %0 = bitcast i64* %a to i8*
 732   %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
 733   %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
 734   %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
 735   %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
 736   %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
       ; Repack the four loaded vectors into the returned struct.
 737   %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
 738   %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
 739   %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
 740   %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
 741   ret %struct.int64x1x4_t %.fca.0.3.insert
 742 }
 743
 744 define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
     ; vld4_dup for <2 x float>: a vld4lane into lane 0 of undef vectors, followed
     ; by a lane-0 splat of every result, is expected to become one ld4r of .2s
     ; registers addressed by [x0].
 745 ; CHECK-LABEL: test_vld4_dup_f32
 746 ; CHECK: ld4r {{{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s}, [x0]
 747 entry:
 748   %0 = bitcast float* %a to i8*
 749   %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
       ; Splat lane 0 of each result (zeroinitializer shuffle mask).
 750   %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
 751   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 752   %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
 753   %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
 754   %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
 755   %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
 756   %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
 757   %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
       ; Pack the four splats into the returned struct.
 758   %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
 759   %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
 760   %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
 761   %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
 762   ret %struct.float32x2x4_t %.fca.0.3.insert
 763 }
 764
 765 define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
     ; vld4_dup for <1 x double>: the IR is a plain llvm.arm.neon.vld4.v1f64 call
     ; (no lane load + splat), and the expected code is a four-register ld1 of
     ; .1d vectors from [x0] -- presumably because a one-lane vector needs no
     ; replication (NOTE(review): confirm).
 766 ; CHECK-LABEL: test_vld4_dup_f64
 767 ; CHECK: ld1 {{{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d}, [x0]
 768 entry:
 769   %0 = bitcast double* %a to i8*
 770   %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
 771   %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
 772   %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
 773   %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
 774   %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
       ; Repack the four loaded vectors into the returned struct.
 775   %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
 776   %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
 777   %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
 778   %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
 779   ret %struct.float64x1x4_t %.fca.0.3.insert
 780 }
 781
 782 define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
     ; vld1q_lane for <16 x i8>: a scalar load inserted into lane 15 of %b is
     ; expected to select a single-lane ld1 of a .b element from [x0].
 783 ; CHECK-LABEL: test_vld1q_lane_s8
 784 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
 785 entry:
 786   %0 = load i8* %a, align 1
 787   %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
 788   ret <16 x i8> %vld1_lane
 789 }
 790
 791 define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
     ; vld1q_lane for <8 x i16>: a scalar load inserted into lane 7 of %b is
     ; expected to select a single-lane ld1 of a .h element from [x0].
 792 ; CHECK-LABEL: test_vld1q_lane_s16
 793 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
 794 entry:
 795   %0 = load i16* %a, align 2
 796   %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
 797   ret <8 x i16> %vld1_lane
 798 }
 799
 800 define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
     ; vld1q_lane for <4 x i32>: a scalar load inserted into lane 3 of %b is
     ; expected to select a single-lane ld1 of a .s element from [x0].
 801 ; CHECK-LABEL: test_vld1q_lane_s32
 802 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 803 entry:
 804   %0 = load i32* %a, align 4
 805   %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
 806   ret <4 x i32> %vld1_lane
 807 }
 808
 809 define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
     ; vld1q_lane for <2 x i64>: a scalar load inserted into lane 1 of %b is
     ; expected to select a single-lane ld1 of a .d element from [x0].
 810 ; CHECK-LABEL: test_vld1q_lane_s64
 811 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
 812 entry:
 813   %0 = load i64* %a, align 8
 814   %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
 815   ret <2 x i64> %vld1_lane
 816 }
 817
 818 define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
     ; vld1q_lane for <4 x float>: a scalar load inserted into lane 3 of %b is
     ; expected to select a single-lane ld1 of a .s element from [x0].
 819 ; CHECK-LABEL: test_vld1q_lane_f32
 820 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 821 entry:
 822   %0 = load float* %a, align 4
 823   %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
 824   ret <4 x float> %vld1_lane
 825 }
 826
 827 define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
     ; vld1q_lane for <2 x double>: a scalar load inserted into lane 1 of %b is
     ; expected to select a single-lane ld1 of a .d element from [x0].
 828 ; CHECK-LABEL: test_vld1q_lane_f64
 829 ; CHECK: ld1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
 830 entry:
 831   %0 = load double* %a, align 8
 832   %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
 833   ret <2 x double> %vld1_lane
 834 }
 835
 836 define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
     ; vld1_lane for <8 x i8>: a scalar load inserted into lane 7 of %b is
     ; expected to select a single-lane ld1 of a .b element from [x0].
 837 ; CHECK-LABEL: test_vld1_lane_s8
 838 ; CHECK: ld1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
 839 entry:
 840   %0 = load i8* %a, align 1
 841   %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
 842   ret <8 x i8> %vld1_lane
 843 }
 844
 845 define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
     ; vld1_lane for <4 x i16>: a scalar load inserted into lane 3 of %b is
     ; expected to select a single-lane ld1 of a .h element from [x0].
 846 ; CHECK-LABEL: test_vld1_lane_s16
 847 ; CHECK: ld1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
 848 entry:
 849   %0 = load i16* %a, align 2
 850   %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
 851   ret <4 x i16> %vld1_lane
 852 }
 853
 854 define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
     ; vld1_lane for <2 x i32>: a scalar load inserted into lane 1 of %b is
     ; expected to select a single-lane ld1 of a .s element from [x0].
 855 ; CHECK-LABEL: test_vld1_lane_s32
 856 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 857 entry:
 858   %0 = load i32* %a, align 4
 859   %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
 860   ret <2 x i32> %vld1_lane
 861 }
 862
 863 define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
     ; vld1_lane for <1 x i64>: the loaded scalar goes into lane 0 of an undef
     ; vector (%b is not used), and an ld1r into a .1d register is expected
     ; instead of a single-lane ld1.
 864 ; CHECK-LABEL: test_vld1_lane_s64
 865 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
 866 entry:
 867   %0 = load i64* %a, align 8
 868   %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
 869   ret <1 x i64> %vld1_lane
 870 }
 871
 872 define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
     ; vld1_lane for <2 x float>: a scalar load inserted into lane 1 of %b is
     ; expected to select a single-lane ld1 of a .s element from [x0].
 873 ; CHECK-LABEL: test_vld1_lane_f32
 874 ; CHECK: ld1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 875 entry:
 876   %0 = load float* %a, align 4
 877   %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
 878   ret <2 x float> %vld1_lane
 879 }
 880
 881 define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
     ; vld1_lane for <1 x double>: the loaded scalar goes into lane 0 of an undef
     ; vector (%b is not used), and an ld1r into a .1d register is expected
     ; instead of a single-lane ld1.
 882 ; CHECK-LABEL: test_vld1_lane_f64
 883 ; CHECK: ld1r {{{v[0-9]+}}.1d}, [x0]
 884 entry:
 885   %0 = load double* %a, align 8
 886   %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
 887   ret <1 x double> %vld1_lane
 888 }
 889
 890 define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
     ; vld2q_lane for <8 x i16>: the two vectors in %b.coerce are passed to
     ; vld2lane with lane operand 7 (the trailing i32 2 is presumably the byte
     ; alignment -- confirm), and a single-lane ld2 of .h elements is expected.
 891 ; CHECK-LABEL: test_vld2q_lane_s16
 892 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
 893 entry:
 894   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
 895   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
 896   %0 = bitcast i16* %a to i8*
 897   %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
 898   %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
 899   %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
       ; Repack both results into the returned struct.
 900   %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
 901   %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
 902   ret %struct.int16x8x2_t %.fca.0.1.insert
 903 }
 904
 905 define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
     ; vld2q_lane for <4 x i32>: the two vectors in %b.coerce are passed to
     ; vld2lane with lane operand 3 (the trailing i32 4 is presumably the byte
     ; alignment -- confirm), and a single-lane ld2 of .s elements is expected.
 906 ; CHECK-LABEL: test_vld2q_lane_s32
 907 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 908 entry:
 909   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
 910   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
 911   %0 = bitcast i32* %a to i8*
 912   %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
 913   %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
 914   %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
       ; Repack both results into the returned struct.
 915   %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
 916   %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
 917   ret %struct.int32x4x2_t %.fca.0.1.insert
 918 }
 919
 920 define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
     ; vld2q_lane for <2 x i64>: the two vectors in %b.coerce are passed to
     ; vld2lane with lane operand 1 (the trailing i32 8 is presumably the byte
     ; alignment -- confirm), and a single-lane ld2 of .d elements is expected.
 921 ; CHECK-LABEL: test_vld2q_lane_s64
 922 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
 923 entry:
 924   %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
 925   %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
 926   %0 = bitcast i64* %a to i8*
 927   %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
 928   %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
 929   %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
       ; Repack both results into the returned struct.
 930   %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
 931   %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
 932   ret %struct.int64x2x2_t %.fca.0.1.insert
 933 }
 934
 935 define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
     ; vld2q_lane for <4 x float>: the two vectors in %b.coerce are passed to
     ; vld2lane with lane operand 3 (the trailing i32 4 is presumably the byte
     ; alignment -- confirm), and a single-lane ld2 of .s elements is expected.
 936 ; CHECK-LABEL: test_vld2q_lane_f32
 937 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 938 entry:
 939   %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
 940   %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
 941   %0 = bitcast float* %a to i8*
 942   %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
 943   %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
 944   %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
       ; Repack both results into the returned struct.
 945   %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
 946   %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
 947   ret %struct.float32x4x2_t %.fca.0.1.insert
 948 }
 949
 950 define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
     ; vld2q_lane for <2 x double>: the two vectors in %b.coerce are passed to
     ; vld2lane with lane operand 1 (the trailing i32 8 is presumably the byte
     ; alignment -- confirm), and a single-lane ld2 of .d elements is expected.
 951 ; CHECK-LABEL: test_vld2q_lane_f64
 952 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
 953 entry:
 954   %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
 955   %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
 956   %0 = bitcast double* %a to i8*
 957   %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
 958   %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
 959   %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
       ; Repack both results into the returned struct.
 960   %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
 961   %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
 962   ret %struct.float64x2x2_t %.fca.0.1.insert
 963 }
 964
 965 define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
     ; vld2_lane for <8 x i8>: the two vectors in %b.coerce are passed to vld2lane
     ; with lane operand 7 (the trailing i32 1 is presumably the byte alignment --
     ; confirm), and a single-lane ld2 of .b elements is expected.
 966 ; CHECK-LABEL: test_vld2_lane_s8
 967 ; CHECK: ld2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
 968 entry:
 969   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
 970   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
 971   %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
 972   %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
 973   %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
       ; Repack both results into the returned struct.
 974   %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
 975   %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
 976   ret %struct.int8x8x2_t %.fca.0.1.insert
 977 }
 978
 979 define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
     ; vld2_lane for <4 x i16>: the two vectors in %b.coerce are passed to
     ; vld2lane with lane operand 3 (the trailing i32 2 is presumably the byte
     ; alignment -- confirm), and a single-lane ld2 of .h elements is expected.
 980 ; CHECK-LABEL: test_vld2_lane_s16
 981 ; CHECK: ld2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
 982 entry:
 983   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
 984   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
 985   %0 = bitcast i16* %a to i8*
 986   %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
 987   %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
 988   %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
       ; Repack both results into the returned struct.
 989   %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
 990   %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
 991   ret %struct.int16x4x2_t %.fca.0.1.insert
 992 }
 993
 994 define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
     ; vld2_lane for <2 x i32>: the two vectors in %b.coerce are passed to
     ; vld2lane with lane operand 1 (the trailing i32 4 is presumably the byte
     ; alignment -- confirm), and a single-lane ld2 of .s elements is expected.
 995 ; CHECK-LABEL: test_vld2_lane_s32
 996 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
 997 entry:
 998   %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
 999   %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
1000   %0 = bitcast i32* %a to i8*
1001   %vld2_lane = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
1002   %vld2_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 0
1003   %vld2_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2_lane, 1
       ; Repack both results into the returned struct.
1004   %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2_lane.fca.0.extract, 0, 0
1005   %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2_lane.fca.1.extract, 0, 1
1006   ret %struct.int32x2x2_t %.fca.0.1.insert
1007 }
1008
1009 define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
     ; vld2_lane for <1 x i64>: the two vectors in %b.coerce are passed to
     ; vld2lane with lane operand 0 (the trailing i32 8 is presumably the byte
     ; alignment -- confirm), and a single-lane ld2 of .d elements is expected.
1010 ; CHECK-LABEL: test_vld2_lane_s64
1011 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1012 entry:
1013   %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
1014   %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
1015   %0 = bitcast i64* %a to i8*
1016   %vld2_lane = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
1017   %vld2_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 0
1018   %vld2_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2_lane, 1
       ; Repack both results into the returned struct.
1019   %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2_lane.fca.0.extract, 0, 0
1020   %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2_lane.fca.1.extract, 0, 1
1021   ret %struct.int64x1x2_t %.fca.0.1.insert
1022 }
1023
1024 define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
     ; vld2_lane for <2 x float>: the two vectors in %b.coerce are passed to
     ; vld2lane with lane operand 1 (the trailing i32 4 is presumably the byte
     ; alignment -- confirm), and a single-lane ld2 of .s elements is expected.
1025 ; CHECK-LABEL: test_vld2_lane_f32
1026 ; CHECK: ld2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1027 entry:
1028   %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
1029   %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
1030   %0 = bitcast float* %a to i8*
1031   %vld2_lane = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
1032   %vld2_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 0
1033   %vld2_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2_lane, 1
       ; Repack both results into the returned struct.
1034   %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2_lane.fca.0.extract, 0, 0
1035   %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2_lane.fca.1.extract, 0, 1
1036   ret %struct.float32x2x2_t %.fca.0.1.insert
1037 }
1038
1039 define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
     ; vld2_lane for <1 x double>: the two vectors in %b.coerce are passed to
     ; vld2lane with lane operand 0 (the trailing i32 8 is presumably the byte
     ; alignment -- confirm), and a single-lane ld2 of .d elements is expected.
1040 ; CHECK-LABEL: test_vld2_lane_f64
1041 ; CHECK: ld2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1042 entry:
1043   %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
1044   %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
1045   %0 = bitcast double* %a to i8*
1046   %vld2_lane = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
1047   %vld2_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 0
1048   %vld2_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2_lane, 1
       ; Repack both results into the returned struct.
1049   %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2_lane.fca.0.extract, 0, 0
1050   %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2_lane.fca.1.extract, 0, 1
1051   ret %struct.float64x1x2_t %.fca.0.1.insert
1052 }
1053
; test_vld3q_lane_s16: splits %b.coerce into its three <8 x i16> vectors, calls
; @llvm.arm.neon.vld3lane.v8i16 on %a (cast to i8*), and repacks the three
; results into %struct.int16x8x3_t. The expected output is a lane-indexed
; "ld3" on .h elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (7, 2) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1054 define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
1055 ; CHECK-LABEL: test_vld3q_lane_s16
1056 ; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1057 entry:
1058   %src0 = extractvalue [3 x <8 x i16>] %b.coerce, 0
1059   %src1 = extractvalue [3 x <8 x i16>] %b.coerce, 1
1060   %src2 = extractvalue [3 x <8 x i16>] %b.coerce, 2
1061   %addr = bitcast i16* %a to i8*
1062   %loaded = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %addr, <8 x i16> %src0, <8 x i16> %src1, <8 x i16> %src2, i32 7, i32 2)
1063   %res0 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %loaded, 0
1064   %res1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %loaded, 1
1065   %res2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %loaded, 2
1066   %ret0 = insertvalue %struct.int16x8x3_t undef, <8 x i16> %res0, 0, 0
1067   %ret1 = insertvalue %struct.int16x8x3_t %ret0, <8 x i16> %res1, 0, 1
1068   %ret2 = insertvalue %struct.int16x8x3_t %ret1, <8 x i16> %res2, 0, 2
1069   ret %struct.int16x8x3_t %ret2
1070 }
1071
; test_vld3q_lane_s32: splits %b.coerce into its three <4 x i32> vectors, calls
; @llvm.arm.neon.vld3lane.v4i32 on %a (cast to i8*), and repacks the three
; results into %struct.int32x4x3_t. The expected output is a lane-indexed
; "ld3" on .s elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (3, 4) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1072 define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
1073 ; CHECK-LABEL: test_vld3q_lane_s32
1074 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1075 entry:
1076   %src0 = extractvalue [3 x <4 x i32>] %b.coerce, 0
1077   %src1 = extractvalue [3 x <4 x i32>] %b.coerce, 1
1078   %src2 = extractvalue [3 x <4 x i32>] %b.coerce, 2
1079   %addr = bitcast i32* %a to i8*
1080   %loaded = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %addr, <4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2, i32 3, i32 4)
1081   %res0 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %loaded, 0
1082   %res1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %loaded, 1
1083   %res2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %loaded, 2
1084   %ret0 = insertvalue %struct.int32x4x3_t undef, <4 x i32> %res0, 0, 0
1085   %ret1 = insertvalue %struct.int32x4x3_t %ret0, <4 x i32> %res1, 0, 1
1086   %ret2 = insertvalue %struct.int32x4x3_t %ret1, <4 x i32> %res2, 0, 2
1087   ret %struct.int32x4x3_t %ret2
1088 }
1089
; test_vld3q_lane_s64: splits %b.coerce into its three <2 x i64> vectors, calls
; @llvm.arm.neon.vld3lane.v2i64 on %a (cast to i8*), and repacks the three
; results into %struct.int64x2x3_t. The expected output is a lane-indexed
; "ld3" on .d elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (1, 8) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1090 define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
1091 ; CHECK-LABEL: test_vld3q_lane_s64
1092 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1093 entry:
1094   %src0 = extractvalue [3 x <2 x i64>] %b.coerce, 0
1095   %src1 = extractvalue [3 x <2 x i64>] %b.coerce, 1
1096   %src2 = extractvalue [3 x <2 x i64>] %b.coerce, 2
1097   %addr = bitcast i64* %a to i8*
1098   %loaded = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %addr, <2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2, i32 1, i32 8)
1099   %res0 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %loaded, 0
1100   %res1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %loaded, 1
1101   %res2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %loaded, 2
1102   %ret0 = insertvalue %struct.int64x2x3_t undef, <2 x i64> %res0, 0, 0
1103   %ret1 = insertvalue %struct.int64x2x3_t %ret0, <2 x i64> %res1, 0, 1
1104   %ret2 = insertvalue %struct.int64x2x3_t %ret1, <2 x i64> %res2, 0, 2
1105   ret %struct.int64x2x3_t %ret2
1106 }
1107
; test_vld3q_lane_f32: splits %b.coerce into its three <4 x float> vectors,
; calls @llvm.arm.neon.vld3lane.v4f32 on %a (cast to i8*), and repacks the
; three results into %struct.float32x4x3_t. The expected output is a
; lane-indexed "ld3" on .s elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (3, 4) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1108 define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
1109 ; CHECK-LABEL: test_vld3q_lane_f32
1110 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1111 entry:
1112   %src0 = extractvalue [3 x <4 x float>] %b.coerce, 0
1113   %src1 = extractvalue [3 x <4 x float>] %b.coerce, 1
1114   %src2 = extractvalue [3 x <4 x float>] %b.coerce, 2
1115   %addr = bitcast float* %a to i8*
1116   %loaded = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %addr, <4 x float> %src0, <4 x float> %src1, <4 x float> %src2, i32 3, i32 4)
1117   %res0 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %loaded, 0
1118   %res1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %loaded, 1
1119   %res2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %loaded, 2
1120   %ret0 = insertvalue %struct.float32x4x3_t undef, <4 x float> %res0, 0, 0
1121   %ret1 = insertvalue %struct.float32x4x3_t %ret0, <4 x float> %res1, 0, 1
1122   %ret2 = insertvalue %struct.float32x4x3_t %ret1, <4 x float> %res2, 0, 2
1123   ret %struct.float32x4x3_t %ret2
1124 }
1125
; test_vld3q_lane_f64: splits %b.coerce into its three <2 x double> vectors,
; calls @llvm.arm.neon.vld3lane.v2f64 on %a (cast to i8*), and repacks the
; three results into %struct.float64x2x3_t. The expected output is a
; lane-indexed "ld3" on .d elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (1, 8) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1126 define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
1127 ; CHECK-LABEL: test_vld3q_lane_f64
1128 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1129 entry:
1130   %src0 = extractvalue [3 x <2 x double>] %b.coerce, 0
1131   %src1 = extractvalue [3 x <2 x double>] %b.coerce, 1
1132   %src2 = extractvalue [3 x <2 x double>] %b.coerce, 2
1133   %addr = bitcast double* %a to i8*
1134   %loaded = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %addr, <2 x double> %src0, <2 x double> %src1, <2 x double> %src2, i32 1, i32 8)
1135   %res0 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %loaded, 0
1136   %res1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %loaded, 1
1137   %res2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %loaded, 2
1138   %ret0 = insertvalue %struct.float64x2x3_t undef, <2 x double> %res0, 0, 0
1139   %ret1 = insertvalue %struct.float64x2x3_t %ret0, <2 x double> %res1, 0, 1
1140   %ret2 = insertvalue %struct.float64x2x3_t %ret1, <2 x double> %res2, 0, 2
1141   ret %struct.float64x2x3_t %ret2
1142 }
1143
; test_vld3_lane_s8: splits %b.coerce into its three <8 x i8> vectors, calls
; @llvm.arm.neon.vld3lane.v8i8 directly on %a (already i8*, so no pointer
; cast), and repacks the three results into %struct.int8x8x3_t. The expected
; output is a lane-indexed "ld3" on .b elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (7, 1) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1144 define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
1145 ; CHECK-LABEL: test_vld3_lane_s8
1146 ; CHECK: ld3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1147 entry:
1148   %src0 = extractvalue [3 x <8 x i8>] %b.coerce, 0
1149   %src1 = extractvalue [3 x <8 x i8>] %b.coerce, 1
1150   %src2 = extractvalue [3 x <8 x i8>] %b.coerce, 2
1151   %loaded = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %src0, <8 x i8> %src1, <8 x i8> %src2, i32 7, i32 1)
1152   %res0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %loaded, 0
1153   %res1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %loaded, 1
1154   %res2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %loaded, 2
1155   %ret0 = insertvalue %struct.int8x8x3_t undef, <8 x i8> %res0, 0, 0
1156   %ret1 = insertvalue %struct.int8x8x3_t %ret0, <8 x i8> %res1, 0, 1
1157   %ret2 = insertvalue %struct.int8x8x3_t %ret1, <8 x i8> %res2, 0, 2
1158   ret %struct.int8x8x3_t %ret2
1159 }
1160
; test_vld3_lane_s16: splits %b.coerce into its three <4 x i16> vectors, calls
; @llvm.arm.neon.vld3lane.v4i16 on %a (cast to i8*), and repacks the three
; results into %struct.int16x4x3_t. The expected output is a lane-indexed
; "ld3" on .h elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (3, 2) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1161 define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
1162 ; CHECK-LABEL: test_vld3_lane_s16
1163 ; CHECK: ld3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1164 entry:
1165   %src0 = extractvalue [3 x <4 x i16>] %b.coerce, 0
1166   %src1 = extractvalue [3 x <4 x i16>] %b.coerce, 1
1167   %src2 = extractvalue [3 x <4 x i16>] %b.coerce, 2
1168   %addr = bitcast i16* %a to i8*
1169   %loaded = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %addr, <4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2, i32 3, i32 2)
1170   %res0 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %loaded, 0
1171   %res1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %loaded, 1
1172   %res2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %loaded, 2
1173   %ret0 = insertvalue %struct.int16x4x3_t undef, <4 x i16> %res0, 0, 0
1174   %ret1 = insertvalue %struct.int16x4x3_t %ret0, <4 x i16> %res1, 0, 1
1175   %ret2 = insertvalue %struct.int16x4x3_t %ret1, <4 x i16> %res2, 0, 2
1176   ret %struct.int16x4x3_t %ret2
1177 }
1178
; test_vld3_lane_s32: splits %b.coerce into its three <2 x i32> vectors, calls
; @llvm.arm.neon.vld3lane.v2i32 on %a (cast to i8*), and repacks the three
; results into %struct.int32x2x3_t. The expected output is a lane-indexed
; "ld3" on .s elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (1, 4) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1179 define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
1180 ; CHECK-LABEL: test_vld3_lane_s32
1181 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1182 entry:
1183   %src0 = extractvalue [3 x <2 x i32>] %b.coerce, 0
1184   %src1 = extractvalue [3 x <2 x i32>] %b.coerce, 1
1185   %src2 = extractvalue [3 x <2 x i32>] %b.coerce, 2
1186   %addr = bitcast i32* %a to i8*
1187   %loaded = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %addr, <2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2, i32 1, i32 4)
1188   %res0 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %loaded, 0
1189   %res1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %loaded, 1
1190   %res2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %loaded, 2
1191   %ret0 = insertvalue %struct.int32x2x3_t undef, <2 x i32> %res0, 0, 0
1192   %ret1 = insertvalue %struct.int32x2x3_t %ret0, <2 x i32> %res1, 0, 1
1193   %ret2 = insertvalue %struct.int32x2x3_t %ret1, <2 x i32> %res2, 0, 2
1194   ret %struct.int32x2x3_t %ret2
1195 }
1196
; test_vld3_lane_s64: splits %b.coerce into its three <1 x i64> vectors, calls
; @llvm.arm.neon.vld3lane.v1i64 on %a (cast to i8*), and repacks the three
; results into %struct.int64x1x3_t. The expected output is a lane-indexed
; "ld3" on .d elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (0, 8) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1197 define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
1198 ; CHECK-LABEL: test_vld3_lane_s64
1199 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1200 entry:
1201   %src0 = extractvalue [3 x <1 x i64>] %b.coerce, 0
1202   %src1 = extractvalue [3 x <1 x i64>] %b.coerce, 1
1203   %src2 = extractvalue [3 x <1 x i64>] %b.coerce, 2
1204   %addr = bitcast i64* %a to i8*
1205   %loaded = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %addr, <1 x i64> %src0, <1 x i64> %src1, <1 x i64> %src2, i32 0, i32 8)
1206   %res0 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %loaded, 0
1207   %res1 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %loaded, 1
1208   %res2 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %loaded, 2
1209   %ret0 = insertvalue %struct.int64x1x3_t undef, <1 x i64> %res0, 0, 0
1210   %ret1 = insertvalue %struct.int64x1x3_t %ret0, <1 x i64> %res1, 0, 1
1211   %ret2 = insertvalue %struct.int64x1x3_t %ret1, <1 x i64> %res2, 0, 2
1212   ret %struct.int64x1x3_t %ret2
1213 }
1214
; test_vld3_lane_f32: splits %b.coerce into its three <2 x float> vectors,
; calls @llvm.arm.neon.vld3lane.v2f32 on %a (cast to i8*), and repacks the
; three results into %struct.float32x2x3_t. The expected output is a
; lane-indexed "ld3" on .s elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (1, 4) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1215 define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
1216 ; CHECK-LABEL: test_vld3_lane_f32
1217 ; CHECK: ld3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1218 entry:
1219   %src0 = extractvalue [3 x <2 x float>] %b.coerce, 0
1220   %src1 = extractvalue [3 x <2 x float>] %b.coerce, 1
1221   %src2 = extractvalue [3 x <2 x float>] %b.coerce, 2
1222   %addr = bitcast float* %a to i8*
1223   %loaded = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %addr, <2 x float> %src0, <2 x float> %src1, <2 x float> %src2, i32 1, i32 4)
1224   %res0 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %loaded, 0
1225   %res1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %loaded, 1
1226   %res2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %loaded, 2
1227   %ret0 = insertvalue %struct.float32x2x3_t undef, <2 x float> %res0, 0, 0
1228   %ret1 = insertvalue %struct.float32x2x3_t %ret0, <2 x float> %res1, 0, 1
1229   %ret2 = insertvalue %struct.float32x2x3_t %ret1, <2 x float> %res2, 0, 2
1230   ret %struct.float32x2x3_t %ret2
1231 }
1232
; test_vld3_lane_f64: splits %b.coerce into its three <1 x double> vectors,
; calls @llvm.arm.neon.vld3lane.v1f64 on %a (cast to i8*), and repacks the
; three results into %struct.float64x1x3_t. The expected output is a
; lane-indexed "ld3" on .d elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (0, 8) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1233 define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
1234 ; CHECK-LABEL: test_vld3_lane_f64
1235 ; CHECK: ld3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1236 entry:
1237   %src0 = extractvalue [3 x <1 x double>] %b.coerce, 0
1238   %src1 = extractvalue [3 x <1 x double>] %b.coerce, 1
1239   %src2 = extractvalue [3 x <1 x double>] %b.coerce, 2
1240   %addr = bitcast double* %a to i8*
1241   %loaded = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %addr, <1 x double> %src0, <1 x double> %src1, <1 x double> %src2, i32 0, i32 8)
1242   %res0 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %loaded, 0
1243   %res1 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %loaded, 1
1244   %res2 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %loaded, 2
1245   %ret0 = insertvalue %struct.float64x1x3_t undef, <1 x double> %res0, 0, 0
1246   %ret1 = insertvalue %struct.float64x1x3_t %ret0, <1 x double> %res1, 0, 1
1247   %ret2 = insertvalue %struct.float64x1x3_t %ret1, <1 x double> %res2, 0, 2
1248   ret %struct.float64x1x3_t %ret2
1249 }
1250
; test_vld4q_lane_s8: splits %b.coerce into its four <16 x i8> vectors, calls
; @llvm.arm.neon.vld4lane.v16i8 directly on %a (already i8*, so no pointer
; cast), and repacks the four results into %struct.int8x16x4_t. The expected
; output is a lane-indexed "ld4" on .b elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (15, 1) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1251 define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) {
1252 ; CHECK-LABEL: test_vld4q_lane_s8
1253 ; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1254 entry:
1255   %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
1256   %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
1257   %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
1258   %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
1259   %vld4_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 1)
1260   %vld4_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 0
1261   %vld4_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 1
1262   %vld4_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 2
1263   %vld4_lane.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4_lane, 3
1264   %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4_lane.fca.0.extract, 0, 0
1265   %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4_lane.fca.1.extract, 0, 1
1266   %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4_lane.fca.2.extract, 0, 2
1267   %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4_lane.fca.3.extract, 0, 3
1268   ret %struct.int8x16x4_t %.fca.0.3.insert
1269 }
1270
; test_vld4q_lane_s16: splits %b.coerce into its four <8 x i16> vectors, calls
; @llvm.arm.neon.vld4lane.v8i16 on %a (cast to i8*), and repacks the four
; results into %struct.int16x8x4_t. The expected output is a lane-indexed
; "ld4" on .h elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (7, 2) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1271 define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
1272 ; CHECK-LABEL: test_vld4q_lane_s16
1273 ; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1274 entry:
1275   %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
1276   %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
1277   %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
1278   %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
1279   %0 = bitcast i16* %a to i8*
1280   %vld4_lane = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
1281   %vld4_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 0
1282   %vld4_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 1
1283   %vld4_lane.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 2
1284   %vld4_lane.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4_lane, 3
1285   %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4_lane.fca.0.extract, 0, 0
1286   %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4_lane.fca.1.extract, 0, 1
1287   %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4_lane.fca.2.extract, 0, 2
1288   %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4_lane.fca.3.extract, 0, 3
1289   ret %struct.int16x8x4_t %.fca.0.3.insert
1290 }
1291
; test_vld4q_lane_s32: splits %b.coerce into its four <4 x i32> vectors, calls
; @llvm.arm.neon.vld4lane.v4i32 on %a (cast to i8*), and repacks the four
; results into %struct.int32x4x4_t. The expected output is a lane-indexed
; "ld4" on .s elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (3, 4) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1292 define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
1293 ; CHECK-LABEL: test_vld4q_lane_s32
1294 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1295 entry:
1296   %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
1297   %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
1298   %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
1299   %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
1300   %0 = bitcast i32* %a to i8*
1301   %vld4_lane = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
1302   %vld4_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 0
1303   %vld4_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 1
1304   %vld4_lane.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 2
1305   %vld4_lane.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4_lane, 3
1306   %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4_lane.fca.0.extract, 0, 0
1307   %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4_lane.fca.1.extract, 0, 1
1308   %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4_lane.fca.2.extract, 0, 2
1309   %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4_lane.fca.3.extract, 0, 3
1310   ret %struct.int32x4x4_t %.fca.0.3.insert
1311 }
1312
; test_vld4q_lane_s64: splits %b.coerce into its four <2 x i64> vectors, calls
; @llvm.arm.neon.vld4lane.v2i64 on %a (cast to i8*), and repacks the four
; results into %struct.int64x2x4_t. The expected output is a lane-indexed
; "ld4" on .d elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (1, 8) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1313 define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
1314 ; CHECK-LABEL: test_vld4q_lane_s64
1315 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1316 entry:
1317   %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
1318   %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
1319   %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
1320   %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
1321   %0 = bitcast i64* %a to i8*
1322   %vld4_lane = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
1323   %vld4_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 0
1324   %vld4_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 1
1325   %vld4_lane.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 2
1326   %vld4_lane.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4_lane, 3
1327   %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4_lane.fca.0.extract, 0, 0
1328   %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4_lane.fca.1.extract, 0, 1
1329   %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4_lane.fca.2.extract, 0, 2
1330   %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4_lane.fca.3.extract, 0, 3
1331   ret %struct.int64x2x4_t %.fca.0.3.insert
1332 }
1333
; test_vld4q_lane_f32: splits %b.coerce into its four <4 x float> vectors,
; calls @llvm.arm.neon.vld4lane.v4f32 on %a (cast to i8*), and repacks the
; four results into %struct.float32x4x4_t. The expected output is a
; lane-indexed "ld4" on .s elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (3, 4) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1334 define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
1335 ; CHECK-LABEL: test_vld4q_lane_f32
1336 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1337 entry:
1338   %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
1339   %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
1340   %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
1341   %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
1342   %0 = bitcast float* %a to i8*
1343   %vld4_lane = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
1344   %vld4_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 0
1345   %vld4_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 1
1346   %vld4_lane.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 2
1347   %vld4_lane.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4_lane, 3
1348   %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4_lane.fca.0.extract, 0, 0
1349   %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4_lane.fca.1.extract, 0, 1
1350   %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4_lane.fca.2.extract, 0, 2
1351   %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4_lane.fca.3.extract, 0, 3
1352   ret %struct.float32x4x4_t %.fca.0.3.insert
1353 }
1354
; test_vld4q_lane_f64: splits %b.coerce into its four <2 x double> vectors,
; calls @llvm.arm.neon.vld4lane.v2f64 on %a (cast to i8*), and repacks the
; four results into %struct.float64x2x4_t. The expected output is a
; lane-indexed "ld4" on .d elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (1, 8) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1355 define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
1356 ; CHECK-LABEL: test_vld4q_lane_f64
1357 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1358 entry:
1359   %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
1360   %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
1361   %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
1362   %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
1363   %0 = bitcast double* %a to i8*
1364   %vld4_lane = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
1365   %vld4_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 0
1366   %vld4_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 1
1367   %vld4_lane.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 2
1368   %vld4_lane.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4_lane, 3
1369   %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4_lane.fca.0.extract, 0, 0
1370   %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4_lane.fca.1.extract, 0, 1
1371   %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4_lane.fca.2.extract, 0, 2
1372   %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4_lane.fca.3.extract, 0, 3
1373   ret %struct.float64x2x4_t %.fca.0.3.insert
1374 }
1375
; test_vld4_lane_s8: splits %b.coerce into its four <8 x i8> vectors, calls
; @llvm.arm.neon.vld4lane.v8i8 directly on %a (already i8*, so no pointer
; cast), and repacks the four results into %struct.int8x8x4_t. The expected
; output is a lane-indexed "ld4" on .b elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (7, 1) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1376 define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
1377 ; CHECK-LABEL: test_vld4_lane_s8
1378 ; CHECK: ld4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1379 entry:
1380   %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
1381   %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
1382   %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
1383   %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
1384   %vld4_lane = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
1385   %vld4_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 0
1386   %vld4_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 1
1387   %vld4_lane.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 2
1388   %vld4_lane.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4_lane, 3
1389   %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4_lane.fca.0.extract, 0, 0
1390   %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4_lane.fca.1.extract, 0, 1
1391   %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4_lane.fca.2.extract, 0, 2
1392   %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4_lane.fca.3.extract, 0, 3
1393   ret %struct.int8x8x4_t %.fca.0.3.insert
1394 }
1395
; test_vld4_lane_s16: splits %b.coerce into its four <4 x i16> vectors, calls
; @llvm.arm.neon.vld4lane.v4i16 on %a (cast to i8*), and repacks the four
; results into %struct.int16x4x4_t. The expected output is a lane-indexed
; "ld4" on .h elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (3, 2) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1396 define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
1397 ; CHECK-LABEL: test_vld4_lane_s16
1398 ; CHECK: ld4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1399 entry:
1400   %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
1401   %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
1402   %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
1403   %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
1404   %0 = bitcast i16* %a to i8*
1405   %vld4_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
1406   %vld4_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 0
1407   %vld4_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 1
1408   %vld4_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 2
1409   %vld4_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4_lane, 3
1410   %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4_lane.fca.0.extract, 0, 0
1411   %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4_lane.fca.1.extract, 0, 1
1412   %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4_lane.fca.2.extract, 0, 2
1413   %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4_lane.fca.3.extract, 0, 3
1414   ret %struct.int16x4x4_t %.fca.0.3.insert
1415 }
1416
; test_vld4_lane_s32: splits %b.coerce into its four <2 x i32> vectors, calls
; @llvm.arm.neon.vld4lane.v2i32 on %a (cast to i8*), and repacks the four
; results into %struct.int32x2x4_t. The expected output is a lane-indexed
; "ld4" on .s elements with x0 as the base register.
; NOTE(review): the trailing i32 operands (1, 4) are presumably lane index and
; alignment - confirm against the intrinsic definition.
1417 define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
1418 ; CHECK-LABEL: test_vld4_lane_s32
1419 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1420 entry:
1421   %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
1422   %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
1423   %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
1424   %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
1425   %0 = bitcast i32* %a to i8*
1426   %vld4_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
1427   %vld4_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 0
1428   %vld4_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 1
1429   %vld4_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 2
1430   %vld4_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4_lane, 3
1431   %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4_lane.fca.0.extract, 0, 0
1432   %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4_lane.fca.1.extract, 0, 1
1433   %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4_lane.fca.2.extract, 0, 2
1434   %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4_lane.fca.3.extract, 0, 3
1435   ret %struct.int32x2x4_t %.fca.0.3.insert
1436 }
1437
1438 define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) { ; four-vector lane load of i64 elements: a lane-indexed ld4 addressed by x0 is expected
1439 ; CHECK-LABEL: test_vld4_lane_s64
1440 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1441 entry:
1442   %src0 = extractvalue [4 x <1 x i64>] %b.coerce, 0 ; unpack the four incoming vectors
1443   %src1 = extractvalue [4 x <1 x i64>] %b.coerce, 1
1444   %src2 = extractvalue [4 x <1 x i64>] %b.coerce, 2
1445   %src3 = extractvalue [4 x <1 x i64>] %b.coerce, 3
1446   %ptr = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1447   %vld4_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %ptr, <1 x i64> %src0, <1 x i64> %src1, <1 x i64> %src2, <1 x i64> %src3, i32 0, i32 8) ; lane operand 0 (the only lane), alignment operand 8
1448   %out0 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 0
1449   %out1 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 1
1450   %out2 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 2
1451   %out3 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4_lane, 3
1452   %res0 = insertvalue %struct.int64x1x4_t undef, <1 x i64> %out0, 0, 0 ; repack the results into the returned struct
1453   %res1 = insertvalue %struct.int64x1x4_t %res0, <1 x i64> %out1, 0, 1
1454   %res2 = insertvalue %struct.int64x1x4_t %res1, <1 x i64> %out2, 0, 2
1455   %res3 = insertvalue %struct.int64x1x4_t %res2, <1 x i64> %out3, 0, 3
1456   ret %struct.int64x1x4_t %res3
1457 }
1458
1459 define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) { ; four-vector lane load of float elements: a lane-indexed ld4 addressed by x0 is expected
1460 ; CHECK-LABEL: test_vld4_lane_f32
1461 ; CHECK: ld4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1462 entry:
1463   %src0 = extractvalue [4 x <2 x float>] %b.coerce, 0 ; unpack the four incoming vectors
1464   %src1 = extractvalue [4 x <2 x float>] %b.coerce, 1
1465   %src2 = extractvalue [4 x <2 x float>] %b.coerce, 2
1466   %src3 = extractvalue [4 x <2 x float>] %b.coerce, 3
1467   %ptr = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1468   %vld4_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %ptr, <2 x float> %src0, <2 x float> %src1, <2 x float> %src2, <2 x float> %src3, i32 1, i32 4) ; lane operand 1, alignment operand 4
1469   %out0 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 0
1470   %out1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 1
1471   %out2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 2
1472   %out3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4_lane, 3
1473   %res0 = insertvalue %struct.float32x2x4_t undef, <2 x float> %out0, 0, 0 ; repack the results into the returned struct
1474   %res1 = insertvalue %struct.float32x2x4_t %res0, <2 x float> %out1, 0, 1
1475   %res2 = insertvalue %struct.float32x2x4_t %res1, <2 x float> %out2, 0, 2
1476   %res3 = insertvalue %struct.float32x2x4_t %res2, <2 x float> %out3, 0, 3
1477   ret %struct.float32x2x4_t %res3
1478 }
1479
1480 define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) { ; four-vector lane load of double elements: a lane-indexed ld4 addressed by x0 is expected
1481 ; CHECK-LABEL: test_vld4_lane_f64
1482 ; CHECK: ld4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1483 entry:
1484   %src0 = extractvalue [4 x <1 x double>] %b.coerce, 0 ; unpack the four incoming vectors
1485   %src1 = extractvalue [4 x <1 x double>] %b.coerce, 1
1486   %src2 = extractvalue [4 x <1 x double>] %b.coerce, 2
1487   %src3 = extractvalue [4 x <1 x double>] %b.coerce, 3
1488   %ptr = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1489   %vld4_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %ptr, <1 x double> %src0, <1 x double> %src1, <1 x double> %src2, <1 x double> %src3, i32 0, i32 8) ; lane operand 0 (the only lane), alignment operand 8
1490   %out0 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 0
1491   %out1 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 1
1492   %out2 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 2
1493   %out3 = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4_lane, 3
1494   %res0 = insertvalue %struct.float64x1x4_t undef, <1 x double> %out0, 0, 0 ; repack the results into the returned struct
1495   %res1 = insertvalue %struct.float64x1x4_t %res0, <1 x double> %out1, 0, 1
1496   %res2 = insertvalue %struct.float64x1x4_t %res1, <1 x double> %out2, 0, 2
1497   %res3 = insertvalue %struct.float64x1x4_t %res2, <1 x double> %out3, 0, 3
1498   ret %struct.float64x1x4_t %res3
1499 }
1500
1501 define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) { ; extract-then-store of one i8 element: a lane-indexed st1 addressed by x0 is expected
1502 ; CHECK-LABEL: test_vst1q_lane_s8
1503 ; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1504 entry:
1505   %elt = extractelement <16 x i8> %b, i32 15 ; last element of the 16-element vector
1506   store i8 %elt, i8* %a, align 1
1507   ret void
1508 }
1509
1510 define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) { ; extract-then-store of one i16 element: a lane-indexed st1 addressed by x0 is expected
1511 ; CHECK-LABEL: test_vst1q_lane_s16
1512 ; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1513 entry:
1514   %elt = extractelement <8 x i16> %b, i32 7 ; last element of the 8-element vector
1515   store i16 %elt, i16* %a, align 2
1516   ret void
1517 }
1518
1519 define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) { ; extract-then-store of one i32 element: a lane-indexed st1 addressed by x0 is expected
1520 ; CHECK-LABEL: test_vst1q_lane_s32
1521 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1522 entry:
1523   %elt = extractelement <4 x i32> %b, i32 3 ; last element of the 4-element vector
1524   store i32 %elt, i32* %a, align 4
1525   ret void
1526 }
1527
1528 define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) { ; extract-then-store of one i64 element: a lane-indexed st1 addressed by x0 is expected
1529 ; CHECK-LABEL: test_vst1q_lane_s64
1530 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1531 entry:
1532   %elt = extractelement <2 x i64> %b, i32 1 ; last element of the 2-element vector
1533   store i64 %elt, i64* %a, align 8
1534   ret void
1535 }
1536
1537 define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) { ; extract-then-store of one float element: a lane-indexed st1 addressed by x0 is expected
1538 ; CHECK-LABEL: test_vst1q_lane_f32
1539 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1540 entry:
1541   %elt = extractelement <4 x float> %b, i32 3 ; last element of the 4-element vector
1542   store float %elt, float* %a, align 4
1543   ret void
1544 }
1545
1546 define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) { ; extract-then-store of one double element: a lane-indexed st1 addressed by x0 is expected
1547 ; CHECK-LABEL: test_vst1q_lane_f64
1548 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1549 entry:
1550   %elt = extractelement <2 x double> %b, i32 1 ; last element of the 2-element vector
1551   store double %elt, double* %a, align 8
1552   ret void
1553 }
1554
1555 define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) { ; extract-then-store of one i8 element from an 8-element vector: a lane-indexed st1 addressed by x0 is expected
1556 ; CHECK-LABEL: test_vst1_lane_s8
1557 ; CHECK: st1 {{{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1558 entry:
1559   %elt = extractelement <8 x i8> %b, i32 7 ; last element of the vector
1560   store i8 %elt, i8* %a, align 1
1561   ret void
1562 }
1563
1564 define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) { ; extract-then-store of one i16 element from a 4-element vector: a lane-indexed st1 addressed by x0 is expected
1565 ; CHECK-LABEL: test_vst1_lane_s16
1566 ; CHECK: st1 {{{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1567 entry:
1568   %elt = extractelement <4 x i16> %b, i32 3 ; last element of the vector
1569   store i16 %elt, i16* %a, align 2
1570   ret void
1571 }
1572
1573 define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) { ; extract-then-store of one i32 element from a 2-element vector: a lane-indexed st1 addressed by x0 is expected
1574 ; CHECK-LABEL: test_vst1_lane_s32
1575 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1576 entry:
1577   %elt = extractelement <2 x i32> %b, i32 1 ; last element of the vector
1578   store i32 %elt, i32* %a, align 4
1579   ret void
1580 }
1581
1582 define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) { ; extract-then-store from a single-element i64 vector: a lane-indexed st1 addressed by x0 is expected
1583 ; CHECK-LABEL: test_vst1_lane_s64
1584 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1585 entry:
1586   %elt = extractelement <1 x i64> %b, i32 0 ; the only element of the vector
1587   store i64 %elt, i64* %a, align 8
1588   ret void
1589 }
1590
1591 define void @test_vst1_lane_f32(float* %a, <2 x float> %b) { ; extract-then-store of one float element from a 2-element vector: a lane-indexed st1 addressed by x0 is expected
1592 ; CHECK-LABEL: test_vst1_lane_f32
1593 ; CHECK: st1 {{{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1594 entry:
1595   %elt = extractelement <2 x float> %b, i32 1 ; last element of the vector
1596   store float %elt, float* %a, align 4
1597   ret void
1598 }
1599
1600 define void @test_vst1_lane_f64(double* %a, <1 x double> %b) { ; extract-then-store from a single-element double vector: a lane-indexed st1 addressed by x0 is expected
1601 ; CHECK-LABEL: test_vst1_lane_f64
1602 ; CHECK: st1 {{{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1603 entry:
1604   %elt = extractelement <1 x double> %b, i32 0 ; the only element of the vector
1605   store double %elt, double* %a, align 8
1606   ret void
1607 }
1608
1609 define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) { ; two-vector lane store of i8 elements: a lane-indexed st2 addressed by x0 is expected
1610 ; CHECK-LABEL: test_vst2q_lane_s8
1611 ; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1612 entry:
1613   %src0 = extractvalue [2 x <16 x i8>] %b.coerce, 0 ; unpack the two incoming vectors
1614   %src1 = extractvalue [2 x <16 x i8>] %b.coerce, 1
1615   tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %src0, <16 x i8> %src1, i32 15, i32 1) ; lane operand 15, alignment operand 1
1616   ret void
1617 }
1618
1619 define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) { ; two-vector lane store of i16 elements: a lane-indexed st2 addressed by x0 is expected
1620 ; CHECK-LABEL: test_vst2q_lane_s16
1621 ; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1622 entry:
1623   %src0 = extractvalue [2 x <8 x i16>] %b.coerce, 0 ; unpack the two incoming vectors
1624   %src1 = extractvalue [2 x <8 x i16>] %b.coerce, 1
1625   %dst = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1626   tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %dst, <8 x i16> %src0, <8 x i16> %src1, i32 7, i32 2) ; lane operand 7, alignment operand 2
1627   ret void
1628 }
1629
1630 define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) { ; two-vector lane store of i32 elements: a lane-indexed st2 addressed by x0 is expected
1631 ; CHECK-LABEL: test_vst2q_lane_s32
1632 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1633 entry:
1634   %src0 = extractvalue [2 x <4 x i32>] %b.coerce, 0 ; unpack the two incoming vectors
1635   %src1 = extractvalue [2 x <4 x i32>] %b.coerce, 1
1636   %dst = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1637   tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %dst, <4 x i32> %src0, <4 x i32> %src1, i32 3, i32 4) ; lane operand 3, alignment operand 4
1638   ret void
1639 }
1640
1641 define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) { ; two-vector lane store of i64 elements: a lane-indexed st2 addressed by x0 is expected
1642 ; CHECK-LABEL: test_vst2q_lane_s64
1643 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1644 entry:
1645   %src0 = extractvalue [2 x <2 x i64>] %b.coerce, 0 ; unpack the two incoming vectors
1646   %src1 = extractvalue [2 x <2 x i64>] %b.coerce, 1
1647   %dst = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1648   tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %dst, <2 x i64> %src0, <2 x i64> %src1, i32 1, i32 8) ; lane operand 1, alignment operand 8
1649   ret void
1650 }
1651
1652 define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) { ; two-vector lane store of float elements: a lane-indexed st2 addressed by x0 is expected
1653 ; CHECK-LABEL: test_vst2q_lane_f32
1654 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1655 entry:
1656   %src0 = extractvalue [2 x <4 x float>] %b.coerce, 0 ; unpack the two incoming vectors
1657   %src1 = extractvalue [2 x <4 x float>] %b.coerce, 1
1658   %dst = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1659   tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %dst, <4 x float> %src0, <4 x float> %src1, i32 3, i32 4) ; lane operand 3, alignment operand 4
1660   ret void
1661 }
1662
1663 define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) { ; two-vector lane store of double elements: a lane-indexed st2 addressed by x0 is expected
1664 ; CHECK-LABEL: test_vst2q_lane_f64
1665 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1666 entry:
1667   %src0 = extractvalue [2 x <2 x double>] %b.coerce, 0 ; unpack the two incoming vectors
1668   %src1 = extractvalue [2 x <2 x double>] %b.coerce, 1
1669   %dst = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1670   tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %dst, <2 x double> %src0, <2 x double> %src1, i32 1, i32 8) ; lane operand 1, alignment operand 8
1671   ret void
1672 }
1673
1674 define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) { ; two-vector lane store of i8 elements (8-element vectors): a lane-indexed st2 addressed by x0 is expected
1675 ; CHECK-LABEL: test_vst2_lane_s8
1676 ; CHECK: st2 {{{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1677 entry:
1678   %src0 = extractvalue [2 x <8 x i8>] %b.coerce, 0 ; unpack the two incoming vectors
1679   %src1 = extractvalue [2 x <8 x i8>] %b.coerce, 1
1680   tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %src0, <8 x i8> %src1, i32 7, i32 1) ; lane operand 7, alignment operand 1
1681   ret void
1682 }
1683
1684 define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) { ; two-vector lane store of i16 elements (4-element vectors): a lane-indexed st2 addressed by x0 is expected
1685 ; CHECK-LABEL: test_vst2_lane_s16
1686 ; CHECK: st2 {{{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1687 entry:
1688   %src0 = extractvalue [2 x <4 x i16>] %b.coerce, 0 ; unpack the two incoming vectors
1689   %src1 = extractvalue [2 x <4 x i16>] %b.coerce, 1
1690   %dst = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1691   tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %dst, <4 x i16> %src0, <4 x i16> %src1, i32 3, i32 2) ; lane operand 3, alignment operand 2
1692   ret void
1693 }
1694
1695 define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) { ; two-vector lane store of i32 elements (2-element vectors): a lane-indexed st2 addressed by x0 is expected
1696 ; CHECK-LABEL: test_vst2_lane_s32
1697 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1698 entry:
1699   %src0 = extractvalue [2 x <2 x i32>] %b.coerce, 0 ; unpack the two incoming vectors
1700   %src1 = extractvalue [2 x <2 x i32>] %b.coerce, 1
1701   %dst = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1702   tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %dst, <2 x i32> %src0, <2 x i32> %src1, i32 1, i32 4) ; lane operand 1, alignment operand 4
1703   ret void
1704 }
1705
1706 define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) { ; two-vector lane store of i64 elements (single-element vectors): a lane-indexed st2 addressed by x0 is expected
1707 ; CHECK-LABEL: test_vst2_lane_s64
1708 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1709 entry:
1710   %src0 = extractvalue [2 x <1 x i64>] %b.coerce, 0 ; unpack the two incoming vectors
1711   %src1 = extractvalue [2 x <1 x i64>] %b.coerce, 1
1712   %dst = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1713   tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %dst, <1 x i64> %src0, <1 x i64> %src1, i32 0, i32 8) ; lane operand 0 (the only lane), alignment operand 8
1714   ret void
1715 }
1716
1717 define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) { ; two-vector lane store of float elements (2-element vectors): a lane-indexed st2 addressed by x0 is expected
1718 ; CHECK-LABEL: test_vst2_lane_f32
1719 ; CHECK: st2 {{{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1720 entry:
1721   %src0 = extractvalue [2 x <2 x float>] %b.coerce, 0 ; unpack the two incoming vectors
1722   %src1 = extractvalue [2 x <2 x float>] %b.coerce, 1
1723   %dst = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1724   tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %dst, <2 x float> %src0, <2 x float> %src1, i32 1, i32 4) ; lane operand 1, alignment operand 4
1725   ret void
1726 }
1727
1728 define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) { ; two-vector lane store of double elements (single-element vectors): a lane-indexed st2 addressed by x0 is expected
1729 ; CHECK-LABEL: test_vst2_lane_f64
1730 ; CHECK: st2 {{{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1731 entry:
1732   %src0 = extractvalue [2 x <1 x double>] %b.coerce, 0 ; unpack the two incoming vectors
1733   %src1 = extractvalue [2 x <1 x double>] %b.coerce, 1
1734   %dst = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1735   tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %dst, <1 x double> %src0, <1 x double> %src1, i32 0, i32 8) ; lane operand 0 (the only lane), alignment operand 8
1736   ret void
1737 }
1738
1739 define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) { ; three-vector lane store of i8 elements: a lane-indexed st3 addressed by x0 is expected
1740 ; CHECK-LABEL: test_vst3q_lane_s8
1741 ; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1742 entry:
1743   %src0 = extractvalue [3 x <16 x i8>] %b.coerce, 0 ; unpack the three incoming vectors
1744   %src1 = extractvalue [3 x <16 x i8>] %b.coerce, 1
1745   %src2 = extractvalue [3 x <16 x i8>] %b.coerce, 2
1746   tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %src0, <16 x i8> %src1, <16 x i8> %src2, i32 15, i32 1) ; lane operand 15, alignment operand 1
1747   ret void
1748 }
1749
1750 define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) { ; three-vector lane store of i16 elements: a lane-indexed st3 addressed by x0 is expected
1751 ; CHECK-LABEL: test_vst3q_lane_s16
1752 ; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1753 entry:
1754   %src0 = extractvalue [3 x <8 x i16>] %b.coerce, 0 ; unpack the three incoming vectors
1755   %src1 = extractvalue [3 x <8 x i16>] %b.coerce, 1
1756   %src2 = extractvalue [3 x <8 x i16>] %b.coerce, 2
1757   %dst = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1758   tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %dst, <8 x i16> %src0, <8 x i16> %src1, <8 x i16> %src2, i32 7, i32 2) ; lane operand 7, alignment operand 2
1759   ret void
1760 }
1761
1762 define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) { ; three-vector lane store of i32 elements: a lane-indexed st3 addressed by x0 is expected
1763 ; CHECK-LABEL: test_vst3q_lane_s32
1764 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1765 entry:
1766   %src0 = extractvalue [3 x <4 x i32>] %b.coerce, 0 ; unpack the three incoming vectors
1767   %src1 = extractvalue [3 x <4 x i32>] %b.coerce, 1
1768   %src2 = extractvalue [3 x <4 x i32>] %b.coerce, 2
1769   %dst = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1770   tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %dst, <4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2, i32 3, i32 4) ; lane operand 3, alignment operand 4
1771   ret void
1772 }
1773
1774 define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) { ; three-vector lane store of i64 elements: a lane-indexed st3 addressed by x0 is expected
1775 ; CHECK-LABEL: test_vst3q_lane_s64
1776 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1777 entry:
1778   %src0 = extractvalue [3 x <2 x i64>] %b.coerce, 0 ; unpack the three incoming vectors
1779   %src1 = extractvalue [3 x <2 x i64>] %b.coerce, 1
1780   %src2 = extractvalue [3 x <2 x i64>] %b.coerce, 2
1781   %dst = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1782   tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %dst, <2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2, i32 1, i32 8) ; lane operand 1, alignment operand 8
1783   ret void
1784 }
1785
1786 define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) { ; three-vector lane store of float elements: a lane-indexed st3 addressed by x0 is expected
1787 ; CHECK-LABEL: test_vst3q_lane_f32
1788 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1789 entry:
1790   %src0 = extractvalue [3 x <4 x float>] %b.coerce, 0 ; unpack the three incoming vectors
1791   %src1 = extractvalue [3 x <4 x float>] %b.coerce, 1
1792   %src2 = extractvalue [3 x <4 x float>] %b.coerce, 2
1793   %dst = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1794   tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %dst, <4 x float> %src0, <4 x float> %src1, <4 x float> %src2, i32 3, i32 4) ; lane operand 3, alignment operand 4
1795   ret void
1796 }
1797
1798 define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) { ; three-vector lane store of double elements: a lane-indexed st3 addressed by x0 is expected
1799 ; CHECK-LABEL: test_vst3q_lane_f64
1800 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1801 entry:
1802   %src0 = extractvalue [3 x <2 x double>] %b.coerce, 0 ; unpack the three incoming vectors
1803   %src1 = extractvalue [3 x <2 x double>] %b.coerce, 1
1804   %src2 = extractvalue [3 x <2 x double>] %b.coerce, 2
1805   %dst = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1806   tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %dst, <2 x double> %src0, <2 x double> %src1, <2 x double> %src2, i32 1, i32 8) ; lane operand 1, alignment operand 8
1807   ret void
1808 }
1809
1810 define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) { ; three-vector lane store of i8 elements (8-element vectors): a lane-indexed st3 addressed by x0 is expected
1811 ; CHECK-LABEL: test_vst3_lane_s8
1812 ; CHECK: st3 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1813 entry:
1814   %src0 = extractvalue [3 x <8 x i8>] %b.coerce, 0 ; unpack the three incoming vectors
1815   %src1 = extractvalue [3 x <8 x i8>] %b.coerce, 1
1816   %src2 = extractvalue [3 x <8 x i8>] %b.coerce, 2
1817   tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %src0, <8 x i8> %src1, <8 x i8> %src2, i32 7, i32 1) ; lane operand 7, alignment operand 1
1818   ret void
1819 }
1820
1821 define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) { ; three-vector lane store of i16 elements (4-element vectors): a lane-indexed st3 addressed by x0 is expected
1822 ; CHECK-LABEL: test_vst3_lane_s16
1823 ; CHECK: st3 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1824 entry:
1825   %src0 = extractvalue [3 x <4 x i16>] %b.coerce, 0 ; unpack the three incoming vectors
1826   %src1 = extractvalue [3 x <4 x i16>] %b.coerce, 1
1827   %src2 = extractvalue [3 x <4 x i16>] %b.coerce, 2
1828   %dst = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1829   tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %dst, <4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2, i32 3, i32 2) ; lane operand 3, alignment operand 2
1830   ret void
1831 }
1832
1833 define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) { ; three-vector lane store of i32 elements (2-element vectors): a lane-indexed st3 addressed by x0 is expected
1834 ; CHECK-LABEL: test_vst3_lane_s32
1835 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1836 entry:
1837   %src0 = extractvalue [3 x <2 x i32>] %b.coerce, 0 ; unpack the three incoming vectors
1838   %src1 = extractvalue [3 x <2 x i32>] %b.coerce, 1
1839   %src2 = extractvalue [3 x <2 x i32>] %b.coerce, 2
1840   %dst = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1841   tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %dst, <2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2, i32 1, i32 4) ; lane operand 1, alignment operand 4
1842   ret void
1843 }
1844
1845 define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) { ; three-vector lane store of i64 elements (single-element vectors): a lane-indexed st3 addressed by x0 is expected
1846 ; CHECK-LABEL: test_vst3_lane_s64
1847 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1848 entry:
1849   %src0 = extractvalue [3 x <1 x i64>] %b.coerce, 0 ; unpack the three incoming vectors
1850   %src1 = extractvalue [3 x <1 x i64>] %b.coerce, 1
1851   %src2 = extractvalue [3 x <1 x i64>] %b.coerce, 2
1852   %dst = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1853   tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %dst, <1 x i64> %src0, <1 x i64> %src1, <1 x i64> %src2, i32 0, i32 8) ; lane operand 0 (the only lane), alignment operand 8
1854   ret void
1855 }
1856
1857 define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) { ; three-vector lane store of float elements (2-element vectors): a lane-indexed st3 addressed by x0 is expected
1858 ; CHECK-LABEL: test_vst3_lane_f32
1859 ; CHECK: st3 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1860 entry:
1861   %src0 = extractvalue [3 x <2 x float>] %b.coerce, 0 ; unpack the three incoming vectors
1862   %src1 = extractvalue [3 x <2 x float>] %b.coerce, 1
1863   %src2 = extractvalue [3 x <2 x float>] %b.coerce, 2
1864   %dst = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1865   tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %dst, <2 x float> %src0, <2 x float> %src1, <2 x float> %src2, i32 1, i32 4) ; lane operand 1, alignment operand 4
1866   ret void
1867 }
1868
1869 define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) { ; three-vector lane store of double elements (single-element vectors): a lane-indexed st3 addressed by x0 is expected
1870 ; CHECK-LABEL: test_vst3_lane_f64
1871 ; CHECK: st3 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1872 entry:
1873   %src0 = extractvalue [3 x <1 x double>] %b.coerce, 0 ; unpack the three incoming vectors
1874   %src1 = extractvalue [3 x <1 x double>] %b.coerce, 1
1875   %src2 = extractvalue [3 x <1 x double>] %b.coerce, 2
1876   %dst = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1877   tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %dst, <1 x double> %src0, <1 x double> %src1, <1 x double> %src2, i32 0, i32 8) ; lane operand 0 (the only lane), alignment operand 8
1878   ret void
1879 }
1880
1881 define void @test_vst4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) { ; four-vector lane store of i8 elements: a lane-indexed st4 addressed by x0 is expected
1882 ; CHECK-LABEL: test_vst4q_lane_s8
1883 ; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1884 entry:
1885   %src0 = extractvalue [4 x <16 x i8>] %b.coerce, 0 ; unpack the four incoming vectors
1886   %src1 = extractvalue [4 x <16 x i8>] %b.coerce, 1
1887   %src2 = extractvalue [4 x <16 x i8>] %b.coerce, 2
1888   %src3 = extractvalue [4 x <16 x i8>] %b.coerce, 3
1889   ; %a is already an i8*, matching the other s8 lane-store tests, so no pointer cast is needed here
1890   tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %a, <16 x i8> %src0, <16 x i8> %src1, <16 x i8> %src2, <16 x i8> %src3, i32 15, i32 1) ; lane operand 15, alignment operand 1 (byte elements)
1891   ret void
1892 }
1893
1894 define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) { ; four-vector lane store of i16 elements: a lane-indexed st4 addressed by x0 is expected
1895 ; CHECK-LABEL: test_vst4q_lane_s16
1896 ; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1897 entry:
1898   %src0 = extractvalue [4 x <8 x i16>] %b.coerce, 0 ; unpack the four incoming vectors
1899   %src1 = extractvalue [4 x <8 x i16>] %b.coerce, 1
1900   %src2 = extractvalue [4 x <8 x i16>] %b.coerce, 2
1901   %src3 = extractvalue [4 x <8 x i16>] %b.coerce, 3
1902   %dst = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1903   tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %dst, <8 x i16> %src0, <8 x i16> %src1, <8 x i16> %src2, <8 x i16> %src3, i32 7, i32 2) ; lane operand 7, alignment operand 2
1904   ret void
1905 }
1906
1907 define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) { ; four-vector lane store of i32 elements: a lane-indexed st4 addressed by x0 is expected
1908 ; CHECK-LABEL: test_vst4q_lane_s32
1909 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1910 entry:
1911   %src0 = extractvalue [4 x <4 x i32>] %b.coerce, 0 ; unpack the four incoming vectors
1912   %src1 = extractvalue [4 x <4 x i32>] %b.coerce, 1
1913   %src2 = extractvalue [4 x <4 x i32>] %b.coerce, 2
1914   %src3 = extractvalue [4 x <4 x i32>] %b.coerce, 3
1915   %dst = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1916   tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %dst, <4 x i32> %src0, <4 x i32> %src1, <4 x i32> %src2, <4 x i32> %src3, i32 3, i32 4) ; lane operand 3, alignment operand 4
1917   ret void
1918 }
1919
1920 define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) { ; four-vector lane store of i64 elements: a lane-indexed st4 addressed by x0 is expected
1921 ; CHECK-LABEL: test_vst4q_lane_s64
1922 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1923 entry:
1924   %src0 = extractvalue [4 x <2 x i64>] %b.coerce, 0 ; unpack the four incoming vectors
1925   %src1 = extractvalue [4 x <2 x i64>] %b.coerce, 1
1926   %src2 = extractvalue [4 x <2 x i64>] %b.coerce, 2
1927   %src3 = extractvalue [4 x <2 x i64>] %b.coerce, 3
1928   %dst = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1929   tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %dst, <2 x i64> %src0, <2 x i64> %src1, <2 x i64> %src2, <2 x i64> %src3, i32 1, i32 8) ; lane operand 1, alignment operand 8
1930   ret void
1931 }
1932
1933 define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) { ; four-vector lane store of float elements: a lane-indexed st4 addressed by x0 is expected
1934 ; CHECK-LABEL: test_vst4q_lane_f32
1935 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1936 entry:
1937   %src0 = extractvalue [4 x <4 x float>] %b.coerce, 0 ; unpack the four incoming vectors
1938   %src1 = extractvalue [4 x <4 x float>] %b.coerce, 1
1939   %src2 = extractvalue [4 x <4 x float>] %b.coerce, 2
1940   %src3 = extractvalue [4 x <4 x float>] %b.coerce, 3
1941   %dst = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1942   tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %dst, <4 x float> %src0, <4 x float> %src1, <4 x float> %src2, <4 x float> %src3, i32 3, i32 4) ; lane operand 3, alignment operand 4
1943   ret void
1944 }
1945
1946 define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) { ; four-vector lane store of double elements: a lane-indexed st4 addressed by x0 is expected
1947 ; CHECK-LABEL: test_vst4q_lane_f64
1948 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
1949 entry:
1950   %src0 = extractvalue [4 x <2 x double>] %b.coerce, 0 ; unpack the four incoming vectors
1951   %src1 = extractvalue [4 x <2 x double>] %b.coerce, 1
1952   %src2 = extractvalue [4 x <2 x double>] %b.coerce, 2
1953   %src3 = extractvalue [4 x <2 x double>] %b.coerce, 3
1954   %dst = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1955   tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %dst, <2 x double> %src0, <2 x double> %src1, <2 x double> %src2, <2 x double> %src3, i32 1, i32 8) ; lane operand 1, alignment operand 8
1956   ret void
1957 }
1958
1959 define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) { ; four-vector lane store of i8 elements (8-element vectors): a lane-indexed st4 addressed by x0 is expected
1960 ; CHECK-LABEL: test_vst4_lane_s8
1961 ; CHECK: st4 {{{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b}[{{[0-9]+}}], [x0]
1962 entry:
1963   %src0 = extractvalue [4 x <8 x i8>] %b.coerce, 0 ; unpack the four incoming vectors
1964   %src1 = extractvalue [4 x <8 x i8>] %b.coerce, 1
1965   %src2 = extractvalue [4 x <8 x i8>] %b.coerce, 2
1966   %src3 = extractvalue [4 x <8 x i8>] %b.coerce, 3
1967   tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %src0, <8 x i8> %src1, <8 x i8> %src2, <8 x i8> %src3, i32 7, i32 1) ; lane operand 7, alignment operand 1
1968   ret void
1969 }
1970
1971 define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) { ; four-vector lane store of i16 elements (4-element vectors): a lane-indexed st4 addressed by x0 is expected
1972 ; CHECK-LABEL: test_vst4_lane_s16
1973 ; CHECK: st4 {{{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h}[{{[0-9]+}}], [x0]
1974 entry:
1975   %src0 = extractvalue [4 x <4 x i16>] %b.coerce, 0 ; unpack the four incoming vectors
1976   %src1 = extractvalue [4 x <4 x i16>] %b.coerce, 1
1977   %src2 = extractvalue [4 x <4 x i16>] %b.coerce, 2
1978   %src3 = extractvalue [4 x <4 x i16>] %b.coerce, 3
1979   %dst = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1980   tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %dst, <4 x i16> %src0, <4 x i16> %src1, <4 x i16> %src2, <4 x i16> %src3, i32 3, i32 2) ; lane operand 3, alignment operand 2
1981   ret void
1982 }
1983
1984 define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) { ; four-vector lane store of i32 elements (2-element vectors): a lane-indexed st4 addressed by x0 is expected
1985 ; CHECK-LABEL: test_vst4_lane_s32
1986 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
1987 entry:
1988   %src0 = extractvalue [4 x <2 x i32>] %b.coerce, 0 ; unpack the four incoming vectors
1989   %src1 = extractvalue [4 x <2 x i32>] %b.coerce, 1
1990   %src2 = extractvalue [4 x <2 x i32>] %b.coerce, 2
1991   %src3 = extractvalue [4 x <2 x i32>] %b.coerce, 3
1992   %dst = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1993   tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %dst, <2 x i32> %src0, <2 x i32> %src1, <2 x i32> %src2, <2 x i32> %src3, i32 1, i32 4) ; lane operand 1, alignment operand 4
1994   ret void
1995 }
1996
1997 define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) { ; four-vector lane store of i64 elements (single-element vectors): a lane-indexed st4 addressed by x0 is expected
1998 ; CHECK-LABEL: test_vst4_lane_s64
1999 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2000 entry:
2001   %src0 = extractvalue [4 x <1 x i64>] %b.coerce, 0 ; unpack the four incoming vectors
2002   %src1 = extractvalue [4 x <1 x i64>] %b.coerce, 1
2003   %src2 = extractvalue [4 x <1 x i64>] %b.coerce, 2
2004   %src3 = extractvalue [4 x <1 x i64>] %b.coerce, 3
2005   %dst = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
2006   tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %dst, <1 x i64> %src0, <1 x i64> %src1, <1 x i64> %src2, <1 x i64> %src3, i32 0, i32 8) ; lane operand 0 (the only lane), alignment operand 8
2007   ret void
2008 }
2009
2010 define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) { ; four-vector lane store of float elements (2-element vectors): a lane-indexed st4 addressed by x0 is expected
2011 ; CHECK-LABEL: test_vst4_lane_f32
2012 ; CHECK: st4 {{{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s}[{{[0-9]+}}], [x0]
2013 entry:
2014   %src0 = extractvalue [4 x <2 x float>] %b.coerce, 0 ; unpack the four incoming vectors
2015   %src1 = extractvalue [4 x <2 x float>] %b.coerce, 1
2016   %src2 = extractvalue [4 x <2 x float>] %b.coerce, 2
2017   %src3 = extractvalue [4 x <2 x float>] %b.coerce, 3
2018   %dst = bitcast float* %a to i8* ; the intrinsic takes an i8* address
2019   tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %dst, <2 x float> %src0, <2 x float> %src1, <2 x float> %src2, <2 x float> %src3, i32 1, i32 4) ; lane operand 1, alignment operand 4
2020   ret void
2021 }
2022
2023 define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) { ; Passes four <1 x double> vectors to llvm.arm.neon.vst4lane.v1f64; the expected output is a lane-indexed st4 of .d elements addressed through x0.
2024 ; CHECK-LABEL: test_vst4_lane_f64
2025 ; CHECK: st4 {{{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d}[{{[0-9]+}}], [x0]
2026 entry:
2027   %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0 ; Unpack the by-value array argument into its four member vectors.
2028   %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
2029   %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
2030   %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
2031   %0 = bitcast double* %a to i8* ; The intrinsic is declared with an i8* address operand, so the typed pointer is cast first.
2032   tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8) ; NOTE(review): the trailing i32 operands (0, 8) are presumably lane index and alignment in bytes - confirm against the intrinsic definition.
2033   ret void
2034 }
2035
2036 declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) ; Two-vector lane loads: take an i8* address, the two input vectors and two i32 operands; return a two-vector struct.
2037 declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2038 declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2039 declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2040 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2041 declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2042 declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2043 declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2044 declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2045 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32) ; NOTE(review): non-lane vld2 form (address plus one i32); the lane form for this type is declared further down - confirm this one is still referenced.
2046 declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2047 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32) ; NOTE(review): non-lane vld2 form; see the lane declaration for v1f64 further down.
2048 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) ; Three-vector lane loads: same operand layout with three input vectors and a three-vector result.
2049 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2050 declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2051 declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2052 declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2053 declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2054 declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2055 declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2056 declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2057 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32) ; NOTE(review): non-lane vld3 form; the lane form for v1i64 is declared further down.
2058 declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2059 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32) ; NOTE(review): non-lane vld3 form; the lane form for v1f64 is declared further down.
2060 declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) ; Four-vector lane loads: same operand layout with four input vectors and a four-vector result.
2061 declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2062 declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2063 declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2064 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2065 declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2066 declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2067 declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2068 declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2069 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32) ; NOTE(review): non-lane vld4 form; the lane form for v1i64 is declared further down.
2070 declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2071 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32) ; NOTE(review): non-lane vld4 form; the lane form for v1f64 is declared further down.
2072 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32) ; Lane-load declarations for the single-element 64-bit vector types (v1i64 / v1f64), two to four vectors.
2073 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
2074 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2075 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2076 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2077 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2078 declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32) ; Two-vector lane stores: take an i8* address, the two source vectors and two i32 operands; return void.
2079 declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2080 declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2081 declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2082 declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2083 declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2084 declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2085 declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2086 declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2087 declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2088 declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2089 declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
2090 declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) ; Three-vector lane stores: same operand layout with three source vectors.
2091 declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2092 declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2093 declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2094 declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2095 declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2096 declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2097 declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2098 declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2099 declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2100 declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2101 declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2102 declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32) ; Four-vector lane stores: same operand layout with four source vectors.
2103 declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2104 declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2105 declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2106 declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2107 declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2108 declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2109 declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2110 declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2111 declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2112 declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2113 declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)