test/CodeGen/AArch64/neon-simd-ldst-one.ll

   1 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
   2 ; The interesting parts of this test were copied into the arm64 directory as aarch64-neon-simd-ldst-one.ll.
   3
   ; Named aggregate types. Each one wraps an array of 2, 3 or 4 vectors (the
   ; count matches the xN suffix of the name); the test functions in this file
   ; use them as multi-vector return types.
   4 %struct.uint8x16x2_t = type { [2 x <16 x i8>] }
   5 %struct.poly8x16x2_t = type { [2 x <16 x i8>] }
   6 %struct.uint8x16x3_t = type { [3 x <16 x i8>] }
   7 %struct.int8x16x2_t = type { [2 x <16 x i8>] }
   8 %struct.int16x8x2_t = type { [2 x <8 x i16>] }
   9 %struct.int32x4x2_t = type { [2 x <4 x i32>] }
  10 %struct.int64x2x2_t = type { [2 x <2 x i64>] }
  11 %struct.float32x4x2_t = type { [2 x <4 x float>] }
  12 %struct.float64x2x2_t = type { [2 x <2 x double>] }
  13 %struct.int8x8x2_t = type { [2 x <8 x i8>] }
  14 %struct.int16x4x2_t = type { [2 x <4 x i16>] }
  15 %struct.int32x2x2_t = type { [2 x <2 x i32>] }
  16 %struct.int64x1x2_t = type { [2 x <1 x i64>] }
  17 %struct.float32x2x2_t = type { [2 x <2 x float>] }
  18 %struct.float64x1x2_t = type { [2 x <1 x double>] }
  19 %struct.int8x16x3_t = type { [3 x <16 x i8>] }
  20 %struct.int16x8x3_t = type { [3 x <8 x i16>] }
  21 %struct.int32x4x3_t = type { [3 x <4 x i32>] }
  22 %struct.int64x2x3_t = type { [3 x <2 x i64>] }
  23 %struct.float32x4x3_t = type { [3 x <4 x float>] }
  24 %struct.float64x2x3_t = type { [3 x <2 x double>] }
  25 %struct.int8x8x3_t = type { [3 x <8 x i8>] }
  26 %struct.int16x4x3_t = type { [3 x <4 x i16>] }
  27 %struct.int32x2x3_t = type { [3 x <2 x i32>] }
  28 %struct.int64x1x3_t = type { [3 x <1 x i64>] }
  29 %struct.float32x2x3_t = type { [3 x <2 x float>] }
  30 %struct.float64x1x3_t = type { [3 x <1 x double>] }
  31 %struct.int8x16x4_t = type { [4 x <16 x i8>] }
  32 %struct.int16x8x4_t = type { [4 x <8 x i16>] }
  33 %struct.int32x4x4_t = type { [4 x <4 x i32>] }
  34 %struct.int64x2x4_t = type { [4 x <2 x i64>] }
  35 %struct.float32x4x4_t = type { [4 x <4 x float>] }
  36 %struct.float64x2x4_t = type { [4 x <2 x double>] }
  37 %struct.int8x8x4_t = type { [4 x <8 x i8>] }
  38 %struct.int16x4x4_t = type { [4 x <4 x i16>] }
  39 %struct.int32x2x4_t = type { [4 x <2 x i32>] }
  40 %struct.int64x1x4_t = type { [4 x <1 x i64>] }
  41 %struct.float32x2x4_t = type { [4 x <2 x float>] }
  42 %struct.float64x1x4_t = type { [4 x <1 x double>] }
  43
  44 define <16 x i8> @test_ld_from_poll_v16i8(<16 x i8> %a) {
     ; Adds a <16 x i8> constant vector to %a. The checks below expect the
     ; constant operand to be fetched with an adrp + ldr (Q register, :lo12:
     ; offset) pair from a label starting with '.' ("poll" in the name
     ; presumably means the constant pool).
     ; NOTE(review): lane 11 of the constant is 2 where the 1..16 sequence
     ; suggests 12; the checks do not inspect the constant's value, so this
     ; looks harmless - confirm before changing.
  45 ; CHECK-LABEL: test_ld_from_poll_v16i8
  46 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  47 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  48 entry:
  49   %b = add <16 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10, i8 11, i8 2, i8 13, i8 14, i8 15, i8 16>
  50   ret <16 x i8> %b
  51 }
  52
  53 define <8 x i16> @test_ld_from_poll_v8i16(<8 x i16> %a) {
     ; Adds a <8 x i16> constant vector to %a; the constant operand is expected
     ; to be loaded into a Q register by an adrp + ldr (:lo12:) pair.
  54 ; CHECK-LABEL: test_ld_from_poll_v8i16
  55 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  56 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  57 entry:
  58   %b = add <8 x i16> %a, <i16 1, i16 2, i16 3, i16 4, i16 5, i16 6, i16 7, i16 8>
  59   ret <8 x i16> %b
  60 }
  61
  62 define <4 x i32> @test_ld_from_poll_v4i32(<4 x i32> %a) {
     ; Adds a <4 x i32> constant vector to %a; the constant operand is expected
     ; to be loaded into a Q register by an adrp + ldr (:lo12:) pair.
  63 ; CHECK-LABEL: test_ld_from_poll_v4i32
  64 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  65 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  66 entry:
  67   %b = add <4 x i32> %a, <i32 1, i32 2, i32 3, i32 4>
  68   ret <4 x i32> %b
  69 }
  70
  71 define <2 x i64> @test_ld_from_poll_v2i64(<2 x i64> %a) {
     ; Adds a <2 x i64> constant vector to %a; the constant operand is expected
     ; to be loaded into a Q register by an adrp + ldr (:lo12:) pair.
  72 ; CHECK-LABEL: test_ld_from_poll_v2i64
  73 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  74 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  75 entry:
  76   %b = add <2 x i64> %a, <i64 1, i64 2>
  77   ret <2 x i64> %b
  78 }
  79
  80 define <4 x float> @test_ld_from_poll_v4f32(<4 x float> %a) {
     ; Floating-point variant: fadd of a <4 x float> constant to %a; the constant
     ; operand is expected to be loaded into a Q register by adrp + ldr (:lo12:).
  81 ; CHECK-LABEL: test_ld_from_poll_v4f32
  82 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  83 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  84 entry:
  85   %b = fadd <4 x float> %a, <float 1.0, float 2.0, float 3.0, float 4.0>
  86   ret <4 x float> %b
  87 }
  88
  89 define <2 x double> @test_ld_from_poll_v2f64(<2 x double> %a) {
     ; Floating-point variant: fadd of a <2 x double> constant to %a; the constant
     ; operand is expected to be loaded into a Q register by adrp + ldr (:lo12:).
  90 ; CHECK-LABEL: test_ld_from_poll_v2f64
  91 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
  92 ; CHECK-NEXT: ldr {{q[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
  93 entry:
  94   %b = fadd <2 x double> %a, <double 1.0, double 2.0>
  95   ret <2 x double> %b
  96 }
  97
  98 define <8 x i8> @test_ld_from_poll_v8i8(<8 x i8> %a) {
     ; 64-bit vector variant: adds a <8 x i8> constant to %a; here the constant
     ; operand is expected in a D register, again via adrp + ldr (:lo12:).
  99 ; CHECK-LABEL: test_ld_from_poll_v8i8
 100 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
 101 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
 102 entry:
 103   %b = add <8 x i8> %a, <i8 1, i8 2, i8 3, i8 4, i8 5, i8 6, i8 7, i8 8>
 104   ret <8 x i8> %b
 105 }
 106
 107 define <4 x i16> @test_ld_from_poll_v4i16(<4 x i16> %a) {
     ; 64-bit vector variant: adds a <4 x i16> constant to %a; the constant
     ; operand is expected in a D register via adrp + ldr (:lo12:).
 108 ; CHECK-LABEL: test_ld_from_poll_v4i16
 109 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
 110 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
 111 entry:
 112   %b = add <4 x i16> %a, <i16 1, i16 2, i16 3, i16 4>
 113   ret <4 x i16> %b
 114 }
 115
 116 define <2 x i32> @test_ld_from_poll_v2i32(<2 x i32> %a) {
     ; 64-bit vector variant: adds a <2 x i32> constant to %a; the constant
     ; operand is expected in a D register via adrp + ldr (:lo12:).
 117 ; CHECK-LABEL: test_ld_from_poll_v2i32
 118 ; CHECK: adrp {{x[0-9]+}}, .{{[A-Z0-9_]+}}
 119 ; CHECK-NEXT: ldr {{d[0-9]+}}, [{{x[0-9]+}}, #:lo12:.{{[A-Z0-9_]+}}]
 120 entry:
 121   %b = add <2 x i32> %a, <i32 1, i32 2>
 122   ret <2 x i32> %b
 123 }
 124
 125 define <16 x i8> @test_vld1q_dup_s8(i8* %a) {
     ; Loads one i8 from %a, inserts it into lane 0 and splats lane 0 across all
     ; 16 lanes (all-zero shuffle mask); a single ld1r .16b from [x0] is expected.
 126 ; CHECK-LABEL: test_vld1q_dup_s8
 127 ; CHECK: ld1r { {{v[0-9]+}}.16b }, [x0]
 128 entry:
 129   %0 = load i8* %a, align 1
 130   %1 = insertelement <16 x i8> undef, i8 %0, i32 0
 131   %lane = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
 132   ret <16 x i8> %lane
 133 }
 134
 135 define <8 x i16> @test_vld1q_dup_s16(i16* %a) {
     ; Loads one i16 from %a, inserts it into lane 0 and splats lane 0 across all
     ; 8 lanes (all-zero shuffle mask); a single ld1r .8h from [x0] is expected.
 136 ; CHECK-LABEL: test_vld1q_dup_s16
 137 ; CHECK: ld1r { {{v[0-9]+}}.8h }, [x0]
 138 entry:
 139   %0 = load i16* %a, align 2
 140   %1 = insertelement <8 x i16> undef, i16 %0, i32 0
 141   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
 142   ret <8 x i16> %lane
 143 }
 144
 145 define <4 x i32> @test_vld1q_dup_s32(i32* %a) {
     ; Loads one i32 from %a, inserts it into lane 0 and splats lane 0 across all
     ; 4 lanes (all-zero shuffle mask); a single ld1r .4s from [x0] is expected.
 146 ; CHECK-LABEL: test_vld1q_dup_s32
 147 ; CHECK: ld1r { {{v[0-9]+}}.4s }, [x0]
 148 entry:
 149   %0 = load i32* %a, align 4
 150   %1 = insertelement <4 x i32> undef, i32 %0, i32 0
 151   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
 152   ret <4 x i32> %lane
 153 }
 154
 155 define <2 x i64> @test_vld1q_dup_s64(i64* %a) {
     ; Loads one i64 from %a, inserts it into lane 0 and splats lane 0 across
     ; both lanes (all-zero shuffle mask); a single ld1r .2d from [x0] is expected.
 156 ; CHECK-LABEL: test_vld1q_dup_s64
 157 ; CHECK: ld1r { {{v[0-9]+}}.2d }, [x0]
 158 entry:
 159   %0 = load i64* %a, align 8
 160   %1 = insertelement <2 x i64> undef, i64 %0, i32 0
 161   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
 162   ret <2 x i64> %lane
 163 }
 164
 165 define <4 x float> @test_vld1q_dup_f32(float* %a) {
     ; Loads one float from %a, inserts it into lane 0 and splats lane 0 across
     ; all 4 lanes (all-zero shuffle mask); a single ld1r .4s from [x0] is expected.
 166 ; CHECK-LABEL: test_vld1q_dup_f32
 167 ; CHECK: ld1r { {{v[0-9]+}}.4s }, [x0]
 168 entry:
 169   %0 = load float* %a, align 4
 170   %1 = insertelement <4 x float> undef, float %0, i32 0
 171   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
 172   ret <4 x float> %lane
 173 }
 174
 175 define <2 x double> @test_vld1q_dup_f64(double* %a) {
     ; Loads one double from %a, inserts it into lane 0 and splats lane 0 across
     ; both lanes (all-zero shuffle mask); a single ld1r .2d from [x0] is expected.
 176 ; CHECK-LABEL: test_vld1q_dup_f64
 177 ; CHECK: ld1r { {{v[0-9]+}}.2d }, [x0]
 178 entry:
 179   %0 = load double* %a, align 8
 180   %1 = insertelement <2 x double> undef, double %0, i32 0
 181   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
 182   ret <2 x double> %lane
 183 }
 184
 185 define <8 x i8> @test_vld1_dup_s8(i8* %a) {
     ; 64-bit vector variant: loads one i8 from %a and splats it across all 8
     ; lanes (all-zero shuffle mask); a single ld1r .8b from [x0] is expected.
 186 ; CHECK-LABEL: test_vld1_dup_s8
 187 ; CHECK: ld1r { {{v[0-9]+}}.8b }, [x0]
 188 entry:
 189   %0 = load i8* %a, align 1
 190   %1 = insertelement <8 x i8> undef, i8 %0, i32 0
 191   %lane = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
 192   ret <8 x i8> %lane
 193 }
 194
 195 define <4 x i16> @test_vld1_dup_s16(i16* %a) {
     ; 64-bit vector variant: loads one i16 from %a and splats it across all 4
     ; lanes (all-zero shuffle mask); a single ld1r .4h from [x0] is expected.
 196 ; CHECK-LABEL: test_vld1_dup_s16
 197 ; CHECK: ld1r { {{v[0-9]+}}.4h }, [x0]
 198 entry:
 199   %0 = load i16* %a, align 2
 200   %1 = insertelement <4 x i16> undef, i16 %0, i32 0
 201   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
 202   ret <4 x i16> %lane
 203 }
 204
 205 define <2 x i32> @test_vld1_dup_s32(i32* %a) {
     ; 64-bit vector variant: loads one i32 from %a and splats it across both
     ; lanes (all-zero shuffle mask); a single ld1r .2s from [x0] is expected.
 206 ; CHECK-LABEL: test_vld1_dup_s32
 207 ; CHECK: ld1r { {{v[0-9]+}}.2s }, [x0]
 208 entry:
 209   %0 = load i32* %a, align 4
 210   %1 = insertelement <2 x i32> undef, i32 %0, i32 0
 211   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
 212   ret <2 x i32> %lane
 213 }
 214
 215 define <1 x i64> @test_vld1_dup_s64(i64* %a) {
     ; <1 x i64> has a single lane, so the loaded value is only inserted into
     ; lane 0 (no shufflevector); the checks still expect ld1r, with .1d.
 216 ; CHECK-LABEL: test_vld1_dup_s64
 217 ; CHECK: ld1r { {{v[0-9]+}}.1d }, [x0]
 218 entry:
 219   %0 = load i64* %a, align 8
 220   %1 = insertelement <1 x i64> undef, i64 %0, i32 0
 221   ret <1 x i64> %1
 222 }
 223
 224 define <2 x float> @test_vld1_dup_f32(float* %a) {
     ; 64-bit vector variant: loads one float from %a and splats it across both
     ; lanes (all-zero shuffle mask); a single ld1r .2s from [x0] is expected.
 225 ; CHECK-LABEL: test_vld1_dup_f32
 226 ; CHECK: ld1r { {{v[0-9]+}}.2s }, [x0]
 227 entry:
 228   %0 = load float* %a, align 4
 229   %1 = insertelement <2 x float> undef, float %0, i32 0
 230   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 231   ret <2 x float> %lane
 232 }
 233
 234 define <1 x double> @test_vld1_dup_f64(double* %a) {
     ; <1 x double> has a single lane, so the loaded value is only inserted into
     ; lane 0 (no shufflevector); the checks still expect ld1r, with .1d.
 235 ; CHECK-LABEL: test_vld1_dup_f64
 236 ; CHECK: ld1r { {{v[0-9]+}}.1d }, [x0]
 237 entry:
 238   %0 = load double* %a, align 8
 239   %1 = insertelement <1 x double> undef, double %0, i32 0
 240   ret <1 x double> %1
 241 }
 242
 243 define <1 x i64> @testDUP.v1i64(i64* %a, i64* %b) #0 {
     ; Negative test for the LD1R pattern: the loaded i64 is both stored to %b
     ; and inserted into lane 0 of the returned <1 x i64>. The #0 attribute
     ; group is defined outside the part of the file shown here.
 244 ; As there is a store operation depending on %1, LD1R pattern can't be selected.
 245 ; So LDR and FMOV should be emitted.
 246 ; CHECK-LABEL: testDUP.v1i64
 247 ; CHECK: ldr {{x[0-9]+}}, [{{x[0-9]+}}]
 248 ; CHECK: fmov {{d[0-9]+}}, {{x[0-9]+}}
 249 ; CHECK: str {{x[0-9]+}}, [{{x[0-9]+}}]
 250   %1 = load i64* %a, align 8
 251   store i64 %1, i64* %b, align 8
 252   %vecinit.i = insertelement <1 x i64> undef, i64 %1, i32 0
 253   ret <1 x i64> %vecinit.i
 254 }
 255
 256 define <1 x double> @testDUP.v1f64(double* %a, double* %b) #0 {
     ; Negative test for the LD1R pattern: the loaded double is both stored to %b
     ; and inserted into lane 0 of the returned <1 x double>. The #0 attribute
     ; group is defined outside the part of the file shown here.
 257 ; As there is a store operation depending on %1, LD1R pattern can't be selected.
 258 ; So a plain LDR into a D register and an STR from a D register should be emitted (no FMOV is checked for here).
 259 ; CHECK-LABEL: testDUP.v1f64
 260 ; CHECK: ldr {{d[0-9]+}}, [{{x[0-9]+}}]
 261 ; CHECK: str {{d[0-9]+}}, [{{x[0-9]+}}]
 262   %1 = load double* %a, align 8
 263   store double %1, double* %b, align 8
 264   %vecinit.i = insertelement <1 x double> undef, double %1, i32 0
 265   ret <1 x double> %vecinit.i
 266 }
 267
 268 define %struct.int8x16x2_t @test_vld2q_dup_s8(i8* %a) {
     ; Calls llvm.arm.neon.vld2lane with undef input vectors, splats lane 0 of
     ; both results (all-zero shuffle masks) and packs them into the return
     ; struct; a single two-register ld2r .16b from [x0] is expected.
 269 ; CHECK-LABEL: test_vld2q_dup_s8
 270 ; CHECK: ld2r { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, [x0]
 271 entry:
 272   %vld_dup = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
 273   %0 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 0
 274   %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
 275   %1 = extractvalue { <16 x i8>, <16 x i8> } %vld_dup, 1
 276   %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
 277   %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %lane, 0, 0
 278   %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
 279   ret %struct.int8x16x2_t %.fca.0.1.insert
 280 }
 281
 282 define %struct.int16x8x2_t @test_vld2q_dup_s16(i16* %a) {
     ; Bitcasts %a to i8*, calls llvm.arm.neon.vld2lane with undef input vectors,
     ; splats lane 0 of both results and packs them into the return struct; a
     ; single two-register ld2r .8h from [x0] is expected.
 283 ; CHECK-LABEL: test_vld2q_dup_s16
 284 ; CHECK: ld2r { {{v[0-9]+}}.8h, {{v[0-9]+}}.8h }, [x0]
 285 entry:
 286   %0 = bitcast i16* %a to i8*
 287   %vld_dup = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
 288   %1 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 0
 289   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
 290   %2 = extractvalue { <8 x i16>, <8 x i16> } %vld_dup, 1
 291   %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
 292   %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %lane, 0, 0
 293   %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
 294   ret %struct.int16x8x2_t %.fca.0.1.insert
 295 }
 296
 297 define %struct.int32x4x2_t @test_vld2q_dup_s32(i32* %a) {
     ; Bitcasts %a to i8*, calls llvm.arm.neon.vld2lane with undef input vectors,
     ; splats lane 0 of both results and packs them into the return struct; a
     ; single two-register ld2r .4s from [x0] is expected.
 298 ; CHECK-LABEL: test_vld2q_dup_s32
 299 ; CHECK: ld2r { {{v[0-9]+}}.4s, {{v[0-9]+}}.4s }, [x0]
 300 entry:
 301   %0 = bitcast i32* %a to i8*
 302   %vld_dup = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
 303   %1 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 0
 304   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
 305   %2 = extractvalue { <4 x i32>, <4 x i32> } %vld_dup, 1
 306   %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
 307   %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %lane, 0, 0
 308   %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
 309   ret %struct.int32x4x2_t %.fca.0.1.insert
 310 }
 311
 312 define %struct.int64x2x2_t @test_vld2q_dup_s64(i64* %a) {
     ; Bitcasts %a to i8*, calls llvm.arm.neon.vld2lane with undef input vectors,
     ; splats lane 0 of both results and packs them into the return struct; a
     ; single two-register ld2r .2d from [x0] is expected.
 313 ; CHECK-LABEL: test_vld2q_dup_s64
 314 ; CHECK: ld2r { {{v[0-9]+}}.2d, {{v[0-9]+}}.2d }, [x0]
 315 entry:
 316   %0 = bitcast i64* %a to i8*
 317   %vld_dup = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
 318   %1 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 0
 319   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
 320   %2 = extractvalue { <2 x i64>, <2 x i64> } %vld_dup, 1
 321   %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
 322   %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %lane, 0, 0
 323   %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
 324   ret %struct.int64x2x2_t %.fca.0.1.insert
 325 }
 326
 327 define %struct.float32x4x2_t @test_vld2q_dup_f32(float* %a) {
     ; Bitcasts %a to i8*, calls llvm.arm.neon.vld2lane with undef input vectors,
     ; splats lane 0 of both results and packs them into the return struct; a
     ; single two-register ld2r .4s from [x0] is expected.
 328 ; CHECK-LABEL: test_vld2q_dup_f32
 329 ; CHECK: ld2r { {{v[0-9]+}}.4s, {{v[0-9]+}}.4s }, [x0]
 330 entry:
 331   %0 = bitcast float* %a to i8*
 332   %vld_dup = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
 333   %1 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 0
 334   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
 335   %2 = extractvalue { <4 x float>, <4 x float> } %vld_dup, 1
 336   %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
 337   %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %lane, 0, 0
 338   %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
 339   ret %struct.float32x4x2_t %.fca.0.1.insert
 340 }
 341
 342 define %struct.float64x2x2_t @test_vld2q_dup_f64(double* %a) {
     ; Bitcasts %a to i8*, calls llvm.arm.neon.vld2lane with undef input vectors,
     ; splats lane 0 of both results and packs them into the return struct; a
     ; single two-register ld2r .2d from [x0] is expected.
 343 ; CHECK-LABEL: test_vld2q_dup_f64
 344 ; CHECK: ld2r { {{v[0-9]+}}.2d, {{v[0-9]+}}.2d }, [x0]
 345 entry:
 346   %0 = bitcast double* %a to i8*
 347   %vld_dup = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
 348   %1 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 0
 349   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
 350   %2 = extractvalue { <2 x double>, <2 x double> } %vld_dup, 1
 351   %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
 352   %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %lane, 0, 0
 353   %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
 354   ret %struct.float64x2x2_t %.fca.0.1.insert
 355 }
 356
 357 define %struct.int8x8x2_t @test_vld2_dup_s8(i8* %a) {
     ; 64-bit vector variant: calls llvm.arm.neon.vld2lane with undef inputs,
     ; splats lane 0 of both results and packs them into the return struct; a
     ; single two-register ld2r .8b from [x0] is expected.
 358 ; CHECK-LABEL: test_vld2_dup_s8
 359 ; CHECK: ld2r { {{v[0-9]+}}.8b, {{v[0-9]+}}.8b }, [x0]
 360 entry:
 361   %vld_dup = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
 362   %0 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 0
 363   %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
 364   %1 = extractvalue { <8 x i8>, <8 x i8> } %vld_dup, 1
 365   %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
 366   %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %lane, 0, 0
 367   %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
 368   ret %struct.int8x8x2_t %.fca.0.1.insert
 369 }
 370
 371 define %struct.int16x4x2_t @test_vld2_dup_s16(i16* %a) {
     ; 64-bit vector variant: bitcasts %a to i8*, calls llvm.arm.neon.vld2lane
     ; with undef inputs, splats lane 0 of both results and packs them into the
     ; return struct; a single two-register ld2r .4h from [x0] is expected.
 372 ; CHECK-LABEL: test_vld2_dup_s16
 373 ; CHECK: ld2r { {{v[0-9]+}}.4h, {{v[0-9]+}}.4h }, [x0]
 374 entry:
 375   %0 = bitcast i16* %a to i8*
 376   %vld_dup = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
 377   %1 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 0
 378   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
 379   %2 = extractvalue { <4 x i16>, <4 x i16> } %vld_dup, 1
 380   %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
 381   %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %lane, 0, 0
 382   %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
 383   ret %struct.int16x4x2_t %.fca.0.1.insert
 384 }
 385
 386 define %struct.int32x2x2_t @test_vld2_dup_s32(i32* %a) {
     ; 64-bit vector variant: bitcasts %a to i8*, calls llvm.arm.neon.vld2lane
     ; with undef inputs, splats lane 0 of both results and packs them into the
     ; return struct; a single two-register ld2r .2s from [x0] is expected.
 387 ; CHECK-LABEL: test_vld2_dup_s32
 388 ; CHECK: ld2r { {{v[0-9]+}}.2s, {{v[0-9]+}}.2s }, [x0]
 389 entry:
 390   %0 = bitcast i32* %a to i8*
 391   %vld_dup = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
 392   %1 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 0
 393   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
 394   %2 = extractvalue { <2 x i32>, <2 x i32> } %vld_dup, 1
 395   %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
 396   %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %lane, 0, 0
 397   %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
 398   ret %struct.int32x2x2_t %.fca.0.1.insert
 399 }
 400
 401 define %struct.int64x1x2_t @test_vld2_dup_s64(i64* %a) {
     ; Single-lane case: calls the whole-vector llvm.arm.neon.vld2 intrinsic (no
     ; lane operand, no shuffles) and repacks the two results; the checks expect
     ; a two-register ld1 .1d from [x0] rather than ld2r.
 402 ; CHECK-LABEL: test_vld2_dup_s64
 403 ; CHECK: ld1 { {{v[0-9]+}}.1d, {{v[0-9]+}}.1d }, [x0]
 404 entry:
 405   %0 = bitcast i64* %a to i8*
 406   %vld_dup = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %0, i32 8)
 407   %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 0
 408   %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld_dup, 1
 409   %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
 410   %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
 411   ret %struct.int64x1x2_t %.fca.0.1.insert
 412 }
 413
 414 define %struct.float32x2x2_t @test_vld2_dup_f32(float* %a) {
     ; 64-bit vector variant: bitcasts %a to i8*, calls llvm.arm.neon.vld2lane
     ; with undef inputs, splats lane 0 of both results and packs them into the
     ; return struct; a single two-register ld2r .2s from [x0] is expected.
 415 ; CHECK-LABEL: test_vld2_dup_f32
 416 ; CHECK: ld2r { {{v[0-9]+}}.2s, {{v[0-9]+}}.2s }, [x0]
 417 entry:
 418   %0 = bitcast float* %a to i8*
 419   %vld_dup = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
 420   %1 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 0
 421   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 422   %2 = extractvalue { <2 x float>, <2 x float> } %vld_dup, 1
 423   %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
 424   %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %lane, 0, 0
 425   %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
 426   ret %struct.float32x2x2_t %.fca.0.1.insert
 427 }
 428
 429 define %struct.float64x1x2_t @test_vld2_dup_f64(double* %a) {
     ; Single-lane case: calls the whole-vector llvm.arm.neon.vld2 intrinsic (no
     ; lane operand, no shuffles) and repacks the two results; the checks expect
     ; a two-register ld1 .1d from [x0] rather than ld2r.
 430 ; CHECK-LABEL: test_vld2_dup_f64
 431 ; CHECK: ld1 { {{v[0-9]+}}.1d, {{v[0-9]+}}.1d }, [x0]
 432 entry:
 433   %0 = bitcast double* %a to i8*
 434   %vld_dup = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %0, i32 8)
 435   %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 0
 436   %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld_dup, 1
 437   %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
 438   %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
 439   ret %struct.float64x1x2_t %.fca.0.1.insert
 440 }
 441
 442 define %struct.int8x16x3_t @test_vld3q_dup_s8(i8* %a) {
     ; Calls llvm.arm.neon.vld3lane with undef input vectors, splats lane 0 of
     ; all three results (all-zero shuffle masks) and packs them into the return
     ; struct; a single three-register ld3r .16b from [x0] is expected.
 443 ; CHECK-LABEL: test_vld3q_dup_s8
 444 ; CHECK: ld3r { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, [x0]
 445 entry:
 446   %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
 447   %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
 448   %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
 449   %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
 450   %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
 451   %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
 452   %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
 453   %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %lane, 0, 0
 454   %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
 455   %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
 456   ret %struct.int8x16x3_t %.fca.0.2.insert
 457 }
 458
 459 define %struct.int16x8x3_t @test_vld3q_dup_s16(i16* %a) {
     ; Bitcasts %a to i8*, calls llvm.arm.neon.vld3lane with undef input vectors,
     ; splats lane 0 of all three results and packs them into the return struct;
     ; a single three-register ld3r .8h from [x0] is expected.
 460 ; CHECK-LABEL: test_vld3q_dup_s16
 461 ; CHECK: ld3r { {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h }, [x0]
 462 entry:
 463   %0 = bitcast i16* %a to i8*
 464   %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
 465   %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
 466   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
 467   %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
 468   %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
 469   %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
 470   %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
 471   %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %lane, 0, 0
 472   %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
 473   %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
 474   ret %struct.int16x8x3_t %.fca.0.2.insert
 475 }
 476
 477 define %struct.int32x4x3_t @test_vld3q_dup_s32(i32* %a) {
     ; Bitcasts %a to i8*, calls llvm.arm.neon.vld3lane with undef input vectors,
     ; splats lane 0 of all three results and packs them into the return struct;
     ; a single three-register ld3r .4s from [x0] is expected.
 478 ; CHECK-LABEL: test_vld3q_dup_s32
 479 ; CHECK: ld3r { {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s }, [x0]
 480 entry:
 481   %0 = bitcast i32* %a to i8*
 482   %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
 483   %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
 484   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
 485   %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
 486   %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
 487   %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
 488   %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
 489   %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %lane, 0, 0
 490   %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
 491   %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
 492   ret %struct.int32x4x3_t %.fca.0.2.insert
 493 }
 494
 495 define %struct.int64x2x3_t @test_vld3q_dup_s64(i64* %a) {
     ; Bitcasts %a to i8*, calls llvm.arm.neon.vld3lane with undef input vectors,
     ; splats lane 0 of all three results and packs them into the return struct;
     ; a single three-register ld3r .2d from [x0] is expected.
 496 ; CHECK-LABEL: test_vld3q_dup_s64
 497 ; CHECK: ld3r { {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d }, [x0]
 498 entry:
 499   %0 = bitcast i64* %a to i8*
 500   %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
 501   %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
 502   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
 503   %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
 504   %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
 505   %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
 506   %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
 507   %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %lane, 0, 0
 508   %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
 509   %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
 510   ret %struct.int64x2x3_t %.fca.0.2.insert
 511 }
 512
 513 define %struct.float32x4x3_t @test_vld3q_dup_f32(float* %a) {
     ; Bitcasts %a to i8*, calls llvm.arm.neon.vld3lane with undef input vectors,
     ; splats lane 0 of all three results and packs them into the return struct;
     ; a single three-register ld3r .4s from [x0] is expected.
 514 ; CHECK-LABEL: test_vld3q_dup_f32
 515 ; CHECK: ld3r { {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s }, [x0]
 516 entry:
 517   %0 = bitcast float* %a to i8*
 518   %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
 519   %1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
 520   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
 521   %2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
 522   %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
 523   %3 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
 524   %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
 525   %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %lane, 0, 0
 526   %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
 527   %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
 528   ret %struct.float32x4x3_t %.fca.0.2.insert
 529 }
 530
 531 define %struct.float64x2x3_t @test_vld3q_dup_f64(double* %a) {
     ; Bitcasts %a to i8*, calls llvm.arm.neon.vld3lane with undef input vectors,
     ; splats lane 0 of all three results and packs them into the return struct;
     ; a single three-register ld3r .2d from [x0] is expected.
 532 ; CHECK-LABEL: test_vld3q_dup_f64
 533 ; CHECK: ld3r { {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d }, [x0]
 534 entry:
 535   %0 = bitcast double* %a to i8*
 536   %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
 537   %1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
 538   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
 539   %2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
 540   %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
 541   %3 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
 542   %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
 543   %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %lane, 0, 0
 544   %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
 545   %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
 546   ret %struct.float64x2x3_t %.fca.0.2.insert
 547 }
 548
 549 define %struct.int8x8x3_t @test_vld3_dup_s8(i8* %a) {
     ; 64-bit vector variant: calls llvm.arm.neon.vld3lane with undef inputs,
     ; splats lane 0 of all three results and packs them into the return struct;
     ; a single three-register ld3r .8b from [x0] is expected.
 550 ; CHECK-LABEL: test_vld3_dup_s8
 551 ; CHECK: ld3r { {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b }, [x0]
 552 entry:
 553   %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
 554   %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
 555   %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
 556   %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
 557   %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
 558   %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
 559   %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
 560   %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %lane, 0, 0
 561   %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
 562   %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
 563   ret %struct.int8x8x3_t %.fca.0.2.insert
 564 }
 565
 566 define %struct.int16x4x3_t @test_vld3_dup_s16(i16* %a) {
     ; 64-bit vector variant: bitcasts %a to i8*, calls llvm.arm.neon.vld3lane
     ; with undef inputs, splats lane 0 of all three results and packs them into
     ; the return struct; a single three-register ld3r .4h from [x0] is expected.
 567 ; CHECK-LABEL: test_vld3_dup_s16
 568 ; CHECK: ld3r { {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h }, [x0]
 569 entry:
 570   %0 = bitcast i16* %a to i8*
 571   %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
 572   %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
 573   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
 574   %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
 575   %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
 576   %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
 577   %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
 578   %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %lane, 0, 0
 579   %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
 580   %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
 581   ret %struct.int16x4x3_t %.fca.0.2.insert
 582 }
 583
 584 define %struct.int32x2x3_t @test_vld3_dup_s32(i32* %a) {
     ; 64-bit vector variant: bitcasts %a to i8*, calls llvm.arm.neon.vld3lane
     ; with undef inputs, splats lane 0 of all three results and packs them into
     ; the return struct; a single three-register ld3r .2s from [x0] is expected.
 585 ; CHECK-LABEL: test_vld3_dup_s32
 586 ; CHECK: ld3r { {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s }, [x0]
 587 entry:
 588   %0 = bitcast i32* %a to i8*
 589   %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
 590   %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
 591   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
 592   %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
 593   %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
 594   %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
 595   %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
 596   %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %lane, 0, 0
 597   %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
 598   %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
 599   ret %struct.int32x2x3_t %.fca.0.2.insert
 600 }
 601
 602 define %struct.int64x1x3_t @test_vld3_dup_s64(i64* %a) {
     ; Dup-load test for one-element vectors: this calls the plain
     ; @llvm.arm.neon.vld3.v1i64 intrinsic and contains no shufflevector.
     ; The FileCheck pattern below requires an ld1 (not an ld3r) on three
     ; .1d registers addressed through x0.
 603 ; CHECK-LABEL: test_vld3_dup_s64
 604 ; CHECK: ld1 { {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d }, [x0]
 605 entry:
       ; The intrinsic takes its address operand as i8*.
 606   %0 = bitcast i64* %a to i8*
       ; Trailing i32 8: presumably the alignment -- confirm against the
       ; intrinsic definition.
 607   %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %0, i32 8)
 608   %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
 609   %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
 610   %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
       ; Pack the results into slots 0..2 of the struct's vector array.
 611   %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
 612   %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
 613   %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
 614   ret %struct.int64x1x3_t %.fca.0.2.insert
 615 }
 616
 617 define %struct.float32x2x3_t @test_vld3_dup_f32(float* %a) {
     ; Dup-load test: calls @llvm.arm.neon.vld3lane.v2f32 with undef input
     ; vectors, then broadcasts element 0 of each of the three results.
     ; The FileCheck pattern below requires an ld3r on three .2s registers
     ; addressed through x0.
 618 ; CHECK-LABEL: test_vld3_dup_f32
 619 ; CHECK: ld3r { {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s }, [x0]
 620 entry:
       ; The intrinsic takes its address operand as i8*.
 621   %0 = bitcast float* %a to i8*
       ; Trailing i32 operands (0, 4): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
 622   %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
 623   %1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
       ; An all-zero shuffle mask copies element 0 into every lane.
 624   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 625   %2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
 626   %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
 627   %3 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
 628   %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
       ; Pack the results into slots 0..2 of the struct's vector array.
 629   %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %lane, 0, 0
 630   %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
 631   %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
 632   ret %struct.float32x2x3_t %.fca.0.2.insert
 633 }
 634
 635 define %struct.float64x1x3_t @test_vld3_dup_f64(double* %a) {
     ; Dup-load test for one-element vectors: this calls the plain
     ; @llvm.arm.neon.vld3.v1f64 intrinsic and contains no shufflevector.
     ; The FileCheck pattern below requires an ld1 (not an ld3r) on three
     ; .1d registers addressed through x0.
 636 ; CHECK-LABEL: test_vld3_dup_f64
 637 ; CHECK: ld1 { {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d }, [x0]
 638 entry:
       ; The intrinsic takes its address operand as i8*.
 639   %0 = bitcast double* %a to i8*
       ; Trailing i32 8: presumably the alignment -- confirm against the
       ; intrinsic definition.
 640   %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %0, i32 8)
 641   %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
 642   %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
 643   %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
       ; Pack the results into slots 0..2 of the struct's vector array.
 644   %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
 645   %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
 646   %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
 647   ret %struct.float64x1x3_t %.fca.0.2.insert
 648 }
 649
 650 define %struct.int8x16x4_t @test_vld4q_dup_s8(i8* %a) {
     ; Dup-load test: calls @llvm.arm.neon.vld4lane.v16i8 with undef input
     ; vectors, then broadcasts element 0 of each of the four results.
     ; The FileCheck pattern below requires an ld4r on four .16b registers
     ; addressed through x0.
 651 ; CHECK-LABEL: test_vld4q_dup_s8
 652 ; CHECK: ld4r { {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b, {{v[0-9]+}}.16b }, [x0]
 653 entry:
       ; %a is already i8*, so it is passed to the intrinsic without a cast.
       ; Trailing i32 operands (0, 1): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
 654   %vld_dup = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, <16 x i8> undef, i32 0, i32 1)
 655   %0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 0
       ; An all-zero shuffle mask copies element 0 into every lane.
 656   %lane = shufflevector <16 x i8> %0, <16 x i8> undef, <16 x i32> zeroinitializer
 657   %1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 1
 658   %lane1 = shufflevector <16 x i8> %1, <16 x i8> undef, <16 x i32> zeroinitializer
 659   %2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 2
 660   %lane2 = shufflevector <16 x i8> %2, <16 x i8> undef, <16 x i32> zeroinitializer
 661   %3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld_dup, 3
 662   %lane3 = shufflevector <16 x i8> %3, <16 x i8> undef, <16 x i32> zeroinitializer
       ; Pack the results into slots 0..3 of the struct's vector array.
 663   %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %lane, 0, 0
 664   %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %lane1, 0, 1
 665   %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %lane2, 0, 2
 666   %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %lane3, 0, 3
 667   ret %struct.int8x16x4_t %.fca.0.3.insert
 668 }
 669
 670 define %struct.int16x8x4_t @test_vld4q_dup_s16(i16* %a) {
     ; Dup-load test: calls @llvm.arm.neon.vld4lane.v8i16 with undef input
     ; vectors, then broadcasts element 0 of each of the four results.
     ; The FileCheck pattern below requires an ld4r on four .8h registers
     ; addressed through x0.
 671 ; CHECK-LABEL: test_vld4q_dup_s16
 672 ; CHECK: ld4r { {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h, {{v[0-9]+}}.8h }, [x0]
 673 entry:
       ; The intrinsic takes its address operand as i8*.
 674   %0 = bitcast i16* %a to i8*
       ; Trailing i32 operands (0, 2): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
 675   %vld_dup = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %0, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, <8 x i16> undef, i32 0, i32 2)
 676   %1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 0
       ; An all-zero shuffle mask copies element 0 into every lane.
 677   %lane = shufflevector <8 x i16> %1, <8 x i16> undef, <8 x i32> zeroinitializer
 678   %2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 1
 679   %lane1 = shufflevector <8 x i16> %2, <8 x i16> undef, <8 x i32> zeroinitializer
 680   %3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 2
 681   %lane2 = shufflevector <8 x i16> %3, <8 x i16> undef, <8 x i32> zeroinitializer
 682   %4 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld_dup, 3
 683   %lane3 = shufflevector <8 x i16> %4, <8 x i16> undef, <8 x i32> zeroinitializer
       ; Pack the results into slots 0..3 of the struct's vector array.
 684   %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %lane, 0, 0
 685   %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %lane1, 0, 1
 686   %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %lane2, 0, 2
 687   %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %lane3, 0, 3
 688   ret %struct.int16x8x4_t %.fca.0.3.insert
 689 }
 690
 691 define %struct.int32x4x4_t @test_vld4q_dup_s32(i32* %a) {
     ; Dup-load test: calls @llvm.arm.neon.vld4lane.v4i32 with undef input
     ; vectors, then broadcasts element 0 of each of the four results.
     ; The FileCheck pattern below requires an ld4r on four .4s registers
     ; addressed through x0.
 692 ; CHECK-LABEL: test_vld4q_dup_s32
 693 ; CHECK: ld4r { {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s }, [x0]
 694 entry:
       ; The intrinsic takes its address operand as i8*.
 695   %0 = bitcast i32* %a to i8*
       ; Trailing i32 operands (0, 4): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
 696   %vld_dup = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %0, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, <4 x i32> undef, i32 0, i32 4)
 697   %1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 0
       ; An all-zero shuffle mask copies element 0 into every lane.
 698   %lane = shufflevector <4 x i32> %1, <4 x i32> undef, <4 x i32> zeroinitializer
 699   %2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 1
 700   %lane1 = shufflevector <4 x i32> %2, <4 x i32> undef, <4 x i32> zeroinitializer
 701   %3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 2
 702   %lane2 = shufflevector <4 x i32> %3, <4 x i32> undef, <4 x i32> zeroinitializer
 703   %4 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld_dup, 3
 704   %lane3 = shufflevector <4 x i32> %4, <4 x i32> undef, <4 x i32> zeroinitializer
       ; Pack the results into slots 0..3 of the struct's vector array.
 705   %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %lane, 0, 0
 706   %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %lane1, 0, 1
 707   %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %lane2, 0, 2
 708   %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %lane3, 0, 3
 709   ret %struct.int32x4x4_t %.fca.0.3.insert
 710 }
 711
 712 define %struct.int64x2x4_t @test_vld4q_dup_s64(i64* %a) {
     ; Dup-load test: calls @llvm.arm.neon.vld4lane.v2i64 with undef input
     ; vectors, then broadcasts element 0 of each of the four results.
     ; The FileCheck pattern below requires an ld4r on four .2d registers
     ; addressed through x0.
 713 ; CHECK-LABEL: test_vld4q_dup_s64
 714 ; CHECK: ld4r { {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d }, [x0]
 715 entry:
       ; The intrinsic takes its address operand as i8*.
 716   %0 = bitcast i64* %a to i8*
       ; Trailing i32 operands (0, 8): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
 717   %vld_dup = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %0, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, <2 x i64> undef, i32 0, i32 8)
 718   %1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 0
       ; An all-zero shuffle mask copies element 0 into every lane.
 719   %lane = shufflevector <2 x i64> %1, <2 x i64> undef, <2 x i32> zeroinitializer
 720   %2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 1
 721   %lane1 = shufflevector <2 x i64> %2, <2 x i64> undef, <2 x i32> zeroinitializer
 722   %3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 2
 723   %lane2 = shufflevector <2 x i64> %3, <2 x i64> undef, <2 x i32> zeroinitializer
 724   %4 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld_dup, 3
 725   %lane3 = shufflevector <2 x i64> %4, <2 x i64> undef, <2 x i32> zeroinitializer
       ; Pack the results into slots 0..3 of the struct's vector array.
 726   %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %lane, 0, 0
 727   %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %lane1, 0, 1
 728   %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %lane2, 0, 2
 729   %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %lane3, 0, 3
 730   ret %struct.int64x2x4_t %.fca.0.3.insert
 731 }
 732
 733 define %struct.float32x4x4_t @test_vld4q_dup_f32(float* %a) {
     ; Dup-load test: calls @llvm.arm.neon.vld4lane.v4f32 with undef input
     ; vectors, then broadcasts element 0 of each of the four results.
     ; The FileCheck pattern below requires an ld4r on four .4s registers
     ; addressed through x0.
 734 ; CHECK-LABEL: test_vld4q_dup_f32
 735 ; CHECK: ld4r { {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s, {{v[0-9]+}}.4s }, [x0]
 736 entry:
       ; The intrinsic takes its address operand as i8*.
 737   %0 = bitcast float* %a to i8*
       ; Trailing i32 operands (0, 4): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
 738   %vld_dup = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %0, <4 x float> undef, <4 x float> undef, <4 x float> undef, <4 x float> undef, i32 0, i32 4)
 739   %1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 0
       ; An all-zero shuffle mask copies element 0 into every lane.
 740   %lane = shufflevector <4 x float> %1, <4 x float> undef, <4 x i32> zeroinitializer
 741   %2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 1
 742   %lane1 = shufflevector <4 x float> %2, <4 x float> undef, <4 x i32> zeroinitializer
 743   %3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 2
 744   %lane2 = shufflevector <4 x float> %3, <4 x float> undef, <4 x i32> zeroinitializer
 745   %4 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld_dup, 3
 746   %lane3 = shufflevector <4 x float> %4, <4 x float> undef, <4 x i32> zeroinitializer
       ; Pack the results into slots 0..3 of the struct's vector array.
 747   %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %lane, 0, 0
 748   %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %lane1, 0, 1
 749   %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %lane2, 0, 2
 750   %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %lane3, 0, 3
 751   ret %struct.float32x4x4_t %.fca.0.3.insert
 752 }
 753
 754 define %struct.float64x2x4_t @test_vld4q_dup_f64(double* %a) {
     ; Dup-load test: calls @llvm.arm.neon.vld4lane.v2f64 with undef input
     ; vectors, then broadcasts element 0 of each of the four results.
     ; The FileCheck pattern below requires an ld4r on four .2d registers
     ; addressed through x0.
 755 ; CHECK-LABEL: test_vld4q_dup_f64
 756 ; CHECK: ld4r { {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d, {{v[0-9]+}}.2d }, [x0]
 757 entry:
       ; The intrinsic takes its address operand as i8*.
 758   %0 = bitcast double* %a to i8*
       ; Trailing i32 operands (0, 8): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
 759   %vld_dup = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %0, <2 x double> undef, <2 x double> undef, <2 x double> undef, <2 x double> undef, i32 0, i32 8)
 760   %1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 0
       ; An all-zero shuffle mask copies element 0 into every lane.
 761   %lane = shufflevector <2 x double> %1, <2 x double> undef, <2 x i32> zeroinitializer
 762   %2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 1
 763   %lane1 = shufflevector <2 x double> %2, <2 x double> undef, <2 x i32> zeroinitializer
 764   %3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 2
 765   %lane2 = shufflevector <2 x double> %3, <2 x double> undef, <2 x i32> zeroinitializer
 766   %4 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld_dup, 3
 767   %lane3 = shufflevector <2 x double> %4, <2 x double> undef, <2 x i32> zeroinitializer
       ; Pack the results into slots 0..3 of the struct's vector array.
 768   %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %lane, 0, 0
 769   %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %lane1, 0, 1
 770   %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %lane2, 0, 2
 771   %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %lane3, 0, 3
 772   ret %struct.float64x2x4_t %.fca.0.3.insert
 773 }
 774
 775 define %struct.int8x8x4_t @test_vld4_dup_s8(i8* %a) {
     ; Dup-load test: calls @llvm.arm.neon.vld4lane.v8i8 with undef input
     ; vectors, then broadcasts element 0 of each of the four results.
     ; The FileCheck pattern below requires an ld4r on four .8b registers
     ; addressed through x0.
 776 ; CHECK-LABEL: test_vld4_dup_s8
 777 ; CHECK: ld4r { {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b, {{v[0-9]+}}.8b }, [x0]
 778 entry:
       ; %a is already i8*, so it is passed to the intrinsic without a cast.
       ; Trailing i32 operands (0, 1): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
 779   %vld_dup = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, <8 x i8> undef, i32 0, i32 1)
 780   %0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 0
       ; An all-zero shuffle mask copies element 0 into every lane.
 781   %lane = shufflevector <8 x i8> %0, <8 x i8> undef, <8 x i32> zeroinitializer
 782   %1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 1
 783   %lane1 = shufflevector <8 x i8> %1, <8 x i8> undef, <8 x i32> zeroinitializer
 784   %2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 2
 785   %lane2 = shufflevector <8 x i8> %2, <8 x i8> undef, <8 x i32> zeroinitializer
 786   %3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld_dup, 3
 787   %lane3 = shufflevector <8 x i8> %3, <8 x i8> undef, <8 x i32> zeroinitializer
       ; Pack the results into slots 0..3 of the struct's vector array.
 788   %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %lane, 0, 0
 789   %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %lane1, 0, 1
 790   %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %lane2, 0, 2
 791   %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %lane3, 0, 3
 792   ret %struct.int8x8x4_t %.fca.0.3.insert
 793 }
 794
 795 define %struct.int16x4x4_t @test_vld4_dup_s16(i16* %a) {
     ; Dup-load test: calls @llvm.arm.neon.vld4lane.v4i16 with undef input
     ; vectors, then broadcasts element 0 of each of the four results.
     ; The FileCheck pattern below requires an ld4r on four .4h registers
     ; addressed through x0.
 796 ; CHECK-LABEL: test_vld4_dup_s16
 797 ; CHECK: ld4r { {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h, {{v[0-9]+}}.4h }, [x0]
 798 entry:
       ; The intrinsic takes its address operand as i8*.
 799   %0 = bitcast i16* %a to i8*
       ; Trailing i32 operands (0, 2): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
 800   %vld_dup = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, <4 x i16> undef, i32 0, i32 2)
 801   %1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 0
       ; An all-zero shuffle mask copies element 0 into every lane.
 802   %lane = shufflevector <4 x i16> %1, <4 x i16> undef, <4 x i32> zeroinitializer
 803   %2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 1
 804   %lane1 = shufflevector <4 x i16> %2, <4 x i16> undef, <4 x i32> zeroinitializer
 805   %3 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 2
 806   %lane2 = shufflevector <4 x i16> %3, <4 x i16> undef, <4 x i32> zeroinitializer
 807   %4 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld_dup, 3
 808   %lane3 = shufflevector <4 x i16> %4, <4 x i16> undef, <4 x i32> zeroinitializer
       ; Pack the results into slots 0..3 of the struct's vector array.
 809   %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %lane, 0, 0
 810   %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %lane1, 0, 1
 811   %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %lane2, 0, 2
 812   %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %lane3, 0, 3
 813   ret %struct.int16x4x4_t %.fca.0.3.insert
 814 }
 815
 816 define %struct.int32x2x4_t @test_vld4_dup_s32(i32* %a) {
     ; Dup-load test: calls @llvm.arm.neon.vld4lane.v2i32 with undef input
     ; vectors, then broadcasts element 0 of each of the four results.
     ; The FileCheck pattern below requires an ld4r on four .2s registers
     ; addressed through x0.
 817 ; CHECK-LABEL: test_vld4_dup_s32
 818 ; CHECK: ld4r { {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s }, [x0]
 819 entry:
       ; The intrinsic takes its address operand as i8*.
 820   %0 = bitcast i32* %a to i8*
       ; Trailing i32 operands (0, 4): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
 821   %vld_dup = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, <2 x i32> undef, i32 0, i32 4)
 822   %1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 0
       ; An all-zero shuffle mask copies element 0 into every lane.
 823   %lane = shufflevector <2 x i32> %1, <2 x i32> undef, <2 x i32> zeroinitializer
 824   %2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 1
 825   %lane1 = shufflevector <2 x i32> %2, <2 x i32> undef, <2 x i32> zeroinitializer
 826   %3 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 2
 827   %lane2 = shufflevector <2 x i32> %3, <2 x i32> undef, <2 x i32> zeroinitializer
 828   %4 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld_dup, 3
 829   %lane3 = shufflevector <2 x i32> %4, <2 x i32> undef, <2 x i32> zeroinitializer
       ; Pack the results into slots 0..3 of the struct's vector array.
 830   %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %lane, 0, 0
 831   %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %lane1, 0, 1
 832   %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %lane2, 0, 2
 833   %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %lane3, 0, 3
 834   ret %struct.int32x2x4_t %.fca.0.3.insert
 835 }
 836
 837 define %struct.int64x1x4_t @test_vld4_dup_s64(i64* %a) {
     ; Dup-load test for one-element vectors: this calls the plain
     ; @llvm.arm.neon.vld4.v1i64 intrinsic and contains no shufflevector.
     ; The FileCheck pattern below requires an ld1 (not an ld4r) on four
     ; .1d registers addressed through x0.
 838 ; CHECK-LABEL: test_vld4_dup_s64
 839 ; CHECK: ld1 { {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d }, [x0]
 840 entry:
       ; The intrinsic takes its address operand as i8*.
 841   %0 = bitcast i64* %a to i8*
       ; Trailing i32 8: presumably the alignment -- confirm against the
       ; intrinsic definition.
 842   %vld_dup = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %0, i32 8)
 843   %vld_dup.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 0
 844   %vld_dup.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 1
 845   %vld_dup.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 2
 846   %vld_dup.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld_dup, 3
       ; Pack the results into slots 0..3 of the struct's vector array.
 847   %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld_dup.fca.0.extract, 0, 0
 848   %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld_dup.fca.1.extract, 0, 1
 849   %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld_dup.fca.2.extract, 0, 2
 850   %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld_dup.fca.3.extract, 0, 3
 851   ret %struct.int64x1x4_t %.fca.0.3.insert
 852 }
 853
 854 define %struct.float32x2x4_t @test_vld4_dup_f32(float* %a) {
     ; Dup-load test: calls @llvm.arm.neon.vld4lane.v2f32 with undef input
     ; vectors, then broadcasts element 0 of each of the four results.
     ; The FileCheck pattern below requires an ld4r on four .2s registers
     ; addressed through x0.
 855 ; CHECK-LABEL: test_vld4_dup_f32
 856 ; CHECK: ld4r { {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s, {{v[0-9]+}}.2s }, [x0]
 857 entry:
       ; The intrinsic takes its address operand as i8*.
 858   %0 = bitcast float* %a to i8*
       ; Trailing i32 operands (0, 4): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
 859   %vld_dup = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> undef, <2 x float> undef, <2 x float> undef, <2 x float> undef, i32 0, i32 4)
 860   %1 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 0
       ; An all-zero shuffle mask copies element 0 into every lane.
 861   %lane = shufflevector <2 x float> %1, <2 x float> undef, <2 x i32> zeroinitializer
 862   %2 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 1
 863   %lane1 = shufflevector <2 x float> %2, <2 x float> undef, <2 x i32> zeroinitializer
 864   %3 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 2
 865   %lane2 = shufflevector <2 x float> %3, <2 x float> undef, <2 x i32> zeroinitializer
 866   %4 = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld_dup, 3
 867   %lane3 = shufflevector <2 x float> %4, <2 x float> undef, <2 x i32> zeroinitializer
       ; Pack the results into slots 0..3 of the struct's vector array.
 868   %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %lane, 0, 0
 869   %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %lane1, 0, 1
 870   %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %lane2, 0, 2
 871   %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %lane3, 0, 3
 872   ret %struct.float32x2x4_t %.fca.0.3.insert
 873 }
 874
 875 define %struct.float64x1x4_t @test_vld4_dup_f64(double* %a) {
     ; Dup-load test for one-element vectors: this calls the plain
     ; @llvm.arm.neon.vld4.v1f64 intrinsic and contains no shufflevector.
     ; The FileCheck pattern below requires an ld1 (not an ld4r) on four
     ; .1d registers addressed through x0.
 876 ; CHECK-LABEL: test_vld4_dup_f64
 877 ; CHECK: ld1 { {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d, {{v[0-9]+}}.1d }, [x0]
 878 entry:
       ; The intrinsic takes its address operand as i8*.
 879   %0 = bitcast double* %a to i8*
       ; Trailing i32 8: presumably the alignment -- confirm against the
       ; intrinsic definition.
 880   %vld_dup = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %0, i32 8)
 881   %vld_dup.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 0
 882   %vld_dup.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 1
 883   %vld_dup.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 2
 884   %vld_dup.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld_dup, 3
       ; Pack the results into slots 0..3 of the struct's vector array.
 885   %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld_dup.fca.0.extract, 0, 0
 886   %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld_dup.fca.1.extract, 0, 1
 887   %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld_dup.fca.2.extract, 0, 2
 888   %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld_dup.fca.3.extract, 0, 3
 889   ret %struct.float64x1x4_t %.fca.0.3.insert
 890 }
 891
 892 define <16 x i8> @test_vld1q_lane_s8(i8* %a, <16 x i8> %b) {
     ; Single-lane load test: an i8 loaded from %a is inserted into the last
     ; lane (15) of %b. The FileCheck pattern below requires a lane-indexed
     ; ld1 on a .b element addressed through x0; the lane number is matched
     ; by a regex rather than pinned.
 893 ; CHECK-LABEL: test_vld1q_lane_s8
 894 ; CHECK: ld1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
 895 entry:
 896   %0 = load i8* %a, align 1
       ; All other lanes of %b pass through unchanged.
 897   %vld1_lane = insertelement <16 x i8> %b, i8 %0, i32 15
 898   ret <16 x i8> %vld1_lane
 899 }
 900
 901 define <8 x i16> @test_vld1q_lane_s16(i16* %a, <8 x i16> %b) {
     ; Single-lane load test: an i16 loaded from %a is inserted into the last
     ; lane (7) of %b. The FileCheck pattern below requires a lane-indexed
     ; ld1 on a .h element addressed through x0; the lane number is matched
     ; by a regex rather than pinned.
 902 ; CHECK-LABEL: test_vld1q_lane_s16
 903 ; CHECK: ld1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
 904 entry:
 905   %0 = load i16* %a, align 2
       ; All other lanes of %b pass through unchanged.
 906   %vld1_lane = insertelement <8 x i16> %b, i16 %0, i32 7
 907   ret <8 x i16> %vld1_lane
 908 }
 909
 910 define <4 x i32> @test_vld1q_lane_s32(i32* %a, <4 x i32> %b) {
     ; Single-lane load test: an i32 loaded from %a is inserted into the last
     ; lane (3) of %b. The FileCheck pattern below requires a lane-indexed
     ; ld1 on a .s element addressed through x0; the lane number is matched
     ; by a regex rather than pinned.
 911 ; CHECK-LABEL: test_vld1q_lane_s32
 912 ; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
 913 entry:
 914   %0 = load i32* %a, align 4
       ; All other lanes of %b pass through unchanged.
 915   %vld1_lane = insertelement <4 x i32> %b, i32 %0, i32 3
 916   ret <4 x i32> %vld1_lane
 917 }
 918
 919 define <2 x i64> @test_vld1q_lane_s64(i64* %a, <2 x i64> %b) {
     ; Single-lane load test: an i64 loaded from %a is inserted into the last
     ; lane (1) of %b. The FileCheck pattern below requires a lane-indexed
     ; ld1 on a .d element addressed through x0; the lane number is matched
     ; by a regex rather than pinned.
 920 ; CHECK-LABEL: test_vld1q_lane_s64
 921 ; CHECK: ld1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
 922 entry:
 923   %0 = load i64* %a, align 8
       ; All other lanes of %b pass through unchanged.
 924   %vld1_lane = insertelement <2 x i64> %b, i64 %0, i32 1
 925   ret <2 x i64> %vld1_lane
 926 }
 927
 928 define <4 x float> @test_vld1q_lane_f32(float* %a, <4 x float> %b) {
     ; Single-lane load test: a float loaded from %a is inserted into the
     ; last lane (3) of %b. The FileCheck pattern below requires a
     ; lane-indexed ld1 on a .s element addressed through x0; the lane number
     ; is matched by a regex rather than pinned.
 929 ; CHECK-LABEL: test_vld1q_lane_f32
 930 ; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
 931 entry:
 932   %0 = load float* %a, align 4
       ; All other lanes of %b pass through unchanged.
 933   %vld1_lane = insertelement <4 x float> %b, float %0, i32 3
 934   ret <4 x float> %vld1_lane
 935 }
 936
 937 define <2 x double> @test_vld1q_lane_f64(double* %a, <2 x double> %b) {
     ; Single-lane load test: a double loaded from %a is inserted into the
     ; last lane (1) of %b. The FileCheck pattern below requires a
     ; lane-indexed ld1 on a .d element addressed through x0; the lane number
     ; is matched by a regex rather than pinned.
 938 ; CHECK-LABEL: test_vld1q_lane_f64
 939 ; CHECK: ld1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
 940 entry:
 941   %0 = load double* %a, align 8
       ; All other lanes of %b pass through unchanged.
 942   %vld1_lane = insertelement <2 x double> %b, double %0, i32 1
 943   ret <2 x double> %vld1_lane
 944 }
 945
 946 define <8 x i8> @test_vld1_lane_s8(i8* %a, <8 x i8> %b) {
     ; Single-lane load test: an i8 loaded from %a is inserted into the last
     ; lane (7) of %b. The FileCheck pattern below requires a lane-indexed
     ; ld1 on a .b element addressed through x0; the lane number is matched
     ; by a regex rather than pinned.
 947 ; CHECK-LABEL: test_vld1_lane_s8
 948 ; CHECK: ld1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
 949 entry:
 950   %0 = load i8* %a, align 1
       ; All other lanes of %b pass through unchanged.
 951   %vld1_lane = insertelement <8 x i8> %b, i8 %0, i32 7
 952   ret <8 x i8> %vld1_lane
 953 }
 954
 955 define <4 x i16> @test_vld1_lane_s16(i16* %a, <4 x i16> %b) {
     ; Single-lane load test: an i16 loaded from %a is inserted into the last
     ; lane (3) of %b. The FileCheck pattern below requires a lane-indexed
     ; ld1 on a .h element addressed through x0; the lane number is matched
     ; by a regex rather than pinned.
 956 ; CHECK-LABEL: test_vld1_lane_s16
 957 ; CHECK: ld1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
 958 entry:
 959   %0 = load i16* %a, align 2
       ; All other lanes of %b pass through unchanged.
 960   %vld1_lane = insertelement <4 x i16> %b, i16 %0, i32 3
 961   ret <4 x i16> %vld1_lane
 962 }
 963
 964 define <2 x i32> @test_vld1_lane_s32(i32* %a, <2 x i32> %b) {
     ; Single-lane load test: an i32 loaded from %a is inserted into the last
     ; lane (1) of %b. The FileCheck pattern below requires a lane-indexed
     ; ld1 on a .s element addressed through x0; the lane number is matched
     ; by a regex rather than pinned.
 965 ; CHECK-LABEL: test_vld1_lane_s32
 966 ; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
 967 entry:
 968   %0 = load i32* %a, align 4
       ; All other lanes of %b pass through unchanged.
 969   %vld1_lane = insertelement <2 x i32> %b, i32 %0, i32 1
 970   ret <2 x i32> %vld1_lane
 971 }
 972
 973 define <1 x i64> @test_vld1_lane_s64(i64* %a, <1 x i64> %b) {
     ; One-element-vector case: the loaded i64 is inserted into lane 0 of
     ; undef, so the %b argument is never read. The FileCheck pattern below
     ; requires an ld1r on a .1d register addressed through x0, not a
     ; lane-indexed ld1.
 974 ; CHECK-LABEL: test_vld1_lane_s64
 975 ; CHECK: ld1r { {{v[0-9]+}}.1d }, [x0]
 976 entry:
 977   %0 = load i64* %a, align 8
       ; Lane 0 is the only lane, so the result is fully defined by the load.
 978   %vld1_lane = insertelement <1 x i64> undef, i64 %0, i32 0
 979   ret <1 x i64> %vld1_lane
 980 }
 981
 982 define <2 x float> @test_vld1_lane_f32(float* %a, <2 x float> %b) {
     ; Single-lane load test: a float loaded from %a is inserted into the
     ; last lane (1) of %b. The FileCheck pattern below requires a
     ; lane-indexed ld1 on a .s element addressed through x0; the lane number
     ; is matched by a regex rather than pinned.
 983 ; CHECK-LABEL: test_vld1_lane_f32
 984 ; CHECK: ld1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
 985 entry:
 986   %0 = load float* %a, align 4
       ; All other lanes of %b pass through unchanged.
 987   %vld1_lane = insertelement <2 x float> %b, float %0, i32 1
 988   ret <2 x float> %vld1_lane
 989 }
 990
 991 define <1 x double> @test_vld1_lane_f64(double* %a, <1 x double> %b) {
     ; One-element-vector case: the loaded double is inserted into lane 0 of
     ; undef, so the %b argument is never read. The FileCheck pattern below
     ; requires an ld1r on a .1d register addressed through x0, not a
     ; lane-indexed ld1.
 992 ; CHECK-LABEL: test_vld1_lane_f64
 993 ; CHECK: ld1r { {{v[0-9]+}}.1d }, [x0]
 994 entry:
 995   %0 = load double* %a, align 8
       ; Lane 0 is the only lane, so the result is fully defined by the load.
 996   %vld1_lane = insertelement <1 x double> undef, double %0, i32 0
 997   ret <1 x double> %vld1_lane
 998 }
 999
1000 define %struct.int16x8x2_t @test_vld2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
     ; Two-vector lane load test: both elements of %b.coerce are passed to
     ; @llvm.arm.neon.vld2lane.v8i16 and the two results are returned in the
     ; struct. The FileCheck pattern below requires a lane-indexed ld2 on two
     ; .h elements addressed through x0; the lane number is regex-matched.
1001 ; CHECK-LABEL: test_vld2q_lane_s16
1002 ; CHECK: ld2 { {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
1003 entry:
       ; Split the incoming array into its two vectors.
1004   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
1005   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
       ; The intrinsic takes its address operand as i8*.
1006   %0 = bitcast i16* %a to i8*
       ; Trailing i32 operands (7, 2): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
1007   %vld2_lane = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
1008   %vld2_lane.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 0
1009   %vld2_lane.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2_lane, 1
       ; Pack the results into slots 0..1 of the struct's vector array.
1010   %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2_lane.fca.0.extract, 0, 0
1011   %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2_lane.fca.1.extract, 0, 1
1012   ret %struct.int16x8x2_t %.fca.0.1.insert
1013 }
1014
1015 define %struct.int32x4x2_t @test_vld2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
     ; Two-vector lane load test: both elements of %b.coerce are passed to
     ; @llvm.arm.neon.vld2lane.v4i32 and the two results are returned in the
     ; struct. The FileCheck pattern below requires a lane-indexed ld2 on two
     ; .s elements addressed through x0; the lane number is regex-matched.
1016 ; CHECK-LABEL: test_vld2q_lane_s32
1017 ; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1018 entry:
       ; Split the incoming array into its two vectors.
1019   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
1020   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
       ; The intrinsic takes its address operand as i8*.
1021   %0 = bitcast i32* %a to i8*
       ; Trailing i32 operands (3, 4): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
1022   %vld2_lane = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
1023   %vld2_lane.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 0
1024   %vld2_lane.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2_lane, 1
       ; Pack the results into slots 0..1 of the struct's vector array.
1025   %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2_lane.fca.0.extract, 0, 0
1026   %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2_lane.fca.1.extract, 0, 1
1027   ret %struct.int32x4x2_t %.fca.0.1.insert
1028 }
1029
1030 define %struct.int64x2x2_t @test_vld2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
     ; Two-vector lane load test: both elements of %b.coerce are passed to
     ; @llvm.arm.neon.vld2lane.v2i64 and the two results are returned in the
     ; struct. The FileCheck pattern below requires a lane-indexed ld2 on two
     ; .d elements addressed through x0; the lane number is regex-matched.
1031 ; CHECK-LABEL: test_vld2q_lane_s64
1032 ; CHECK: ld2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1033 entry:
       ; Split the incoming array into its two vectors.
1034   %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
1035   %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
       ; The intrinsic takes its address operand as i8*.
1036   %0 = bitcast i64* %a to i8*
       ; Trailing i32 operands (1, 8): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
1037   %vld2_lane = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
1038   %vld2_lane.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 0
1039   %vld2_lane.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2_lane, 1
       ; Pack the results into slots 0..1 of the struct's vector array.
1040   %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2_lane.fca.0.extract, 0, 0
1041   %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2_lane.fca.1.extract, 0, 1
1042   ret %struct.int64x2x2_t %.fca.0.1.insert
1043 }
1044
1045 define %struct.float32x4x2_t @test_vld2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
     ; Two-vector lane load test: both elements of %b.coerce are passed to
     ; @llvm.arm.neon.vld2lane.v4f32 and the two results are returned in the
     ; struct. The FileCheck pattern below requires a lane-indexed ld2 on two
     ; .s elements addressed through x0; the lane number is regex-matched.
1046 ; CHECK-LABEL: test_vld2q_lane_f32
1047 ; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1048 entry:
       ; Split the incoming array into its two vectors.
1049   %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
1050   %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
       ; The intrinsic takes its address operand as i8*.
1051   %0 = bitcast float* %a to i8*
       ; Trailing i32 operands (3, 4): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
1052   %vld2_lane = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
1053   %vld2_lane.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 0
1054   %vld2_lane.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2_lane, 1
       ; Pack the results into slots 0..1 of the struct's vector array.
1055   %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2_lane.fca.0.extract, 0, 0
1056   %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2_lane.fca.1.extract, 0, 1
1057   ret %struct.float32x4x2_t %.fca.0.1.insert
1058 }
1059
1060 define %struct.float64x2x2_t @test_vld2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
     ; Two-vector lane load test: both elements of %b.coerce are passed to
     ; @llvm.arm.neon.vld2lane.v2f64 and the two results are returned in the
     ; struct. The FileCheck pattern below requires a lane-indexed ld2 on two
     ; .d elements addressed through x0; the lane number is regex-matched.
1061 ; CHECK-LABEL: test_vld2q_lane_f64
1062 ; CHECK: ld2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1063 entry:
       ; Split the incoming array into its two vectors.
1064   %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
1065   %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
       ; The intrinsic takes its address operand as i8*.
1066   %0 = bitcast double* %a to i8*
       ; Trailing i32 operands (1, 8): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
1067   %vld2_lane = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
1068   %vld2_lane.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 0
1069   %vld2_lane.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2_lane, 1
       ; Pack the results into slots 0..1 of the struct's vector array.
1070   %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2_lane.fca.0.extract, 0, 0
1071   %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2_lane.fca.1.extract, 0, 1
1072   ret %struct.float64x2x2_t %.fca.0.1.insert
1073 }
1074
1075 define %struct.int8x8x2_t @test_vld2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
     ; Two-vector lane load test: both elements of %b.coerce are passed to
     ; @llvm.arm.neon.vld2lane.v8i8 and the two results are returned in the
     ; struct. The FileCheck pattern below requires a lane-indexed ld2 on two
     ; .b elements addressed through x0; the lane number is regex-matched.
1076 ; CHECK-LABEL: test_vld2_lane_s8
1077 ; CHECK: ld2 { {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
1078 entry:
       ; Split the incoming array into its two vectors.
1079   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
1080   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
       ; %a is already i8*, so it is passed to the intrinsic without a cast.
       ; Trailing i32 operands (7, 1): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
1081   %vld2_lane = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
1082   %vld2_lane.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 0
1083   %vld2_lane.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2_lane, 1
       ; Pack the results into slots 0..1 of the struct's vector array.
1084   %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2_lane.fca.0.extract, 0, 0
1085   %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2_lane.fca.1.extract, 0, 1
1086   ret %struct.int8x8x2_t %.fca.0.1.insert
1087 }
1088
1089 define %struct.int16x4x2_t @test_vld2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
     ; Two-vector lane load test: both elements of %b.coerce are passed to
     ; @llvm.arm.neon.vld2lane.v4i16 and the two results are returned in the
     ; struct. The FileCheck pattern below requires a lane-indexed ld2 on two
     ; .h elements addressed through x0; the lane number is regex-matched.
1090 ; CHECK-LABEL: test_vld2_lane_s16
1091 ; CHECK: ld2 { {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
1092 entry:
       ; Split the incoming array into its two vectors.
1093   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
1094   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
       ; The intrinsic takes its address operand as i8*.
1095   %0 = bitcast i16* %a to i8*
       ; Trailing i32 operands (3, 2): presumably lane index and alignment --
       ; confirm against the intrinsic definition.
1096   %vld2_lane = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
1097   %vld2_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 0
1098   %vld2_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2_lane, 1
       ; Pack the results into slots 0..1 of the struct's vector array.
1099   %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2_lane.fca.0.extract, 0, 0
1100   %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2_lane.fca.1.extract, 0, 1
1101   ret %struct.int16x4x2_t %.fca.0.1.insert
1102 }
1103
1104 define %struct.int32x2x2_t @test_vld2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) { ; vld2lane.v2i32: trailing i32 operands are lane index (1) and alignment (4)
1105 ; CHECK-LABEL: test_vld2_lane_s32
1106 ; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1107 entry:
1108   %in0 = extractvalue [2 x <2 x i32>] %b.coerce, 0 ; unpack the incoming vector pair
1109   %in1 = extractvalue [2 x <2 x i32>] %b.coerce, 1
1110   %addr = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1111   %ld = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8* %addr, <2 x i32> %in0, <2 x i32> %in1, i32 1, i32 4)
1112   %out0 = extractvalue { <2 x i32>, <2 x i32> } %ld, 0
1113   %out1 = extractvalue { <2 x i32>, <2 x i32> } %ld, 1
1114   %agg0 = insertvalue %struct.int32x2x2_t undef, <2 x i32> %out0, 0, 0 ; rebuild the returned struct element by element
1115   %agg1 = insertvalue %struct.int32x2x2_t %agg0, <2 x i32> %out1, 0, 1
1116   ret %struct.int32x2x2_t %agg1
1117 }
1118
1119 define %struct.int64x1x2_t @test_vld2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) { ; vld2lane.v1i64: trailing i32 operands are lane index (0) and alignment (8)
1120 ; CHECK-LABEL: test_vld2_lane_s64
1121 ; CHECK: ld2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1122 entry:
1123   %in0 = extractvalue [2 x <1 x i64>] %b.coerce, 0 ; unpack the incoming vector pair
1124   %in1 = extractvalue [2 x <1 x i64>] %b.coerce, 1
1125   %addr = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1126   %ld = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8* %addr, <1 x i64> %in0, <1 x i64> %in1, i32 0, i32 8)
1127   %out0 = extractvalue { <1 x i64>, <1 x i64> } %ld, 0
1128   %out1 = extractvalue { <1 x i64>, <1 x i64> } %ld, 1
1129   %agg0 = insertvalue %struct.int64x1x2_t undef, <1 x i64> %out0, 0, 0 ; rebuild the returned struct element by element
1130   %agg1 = insertvalue %struct.int64x1x2_t %agg0, <1 x i64> %out1, 0, 1
1131   ret %struct.int64x1x2_t %agg1
1132 }
1133
1134 define %struct.float32x2x2_t @test_vld2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) { ; vld2lane.v2f32: trailing i32 operands are lane index (1) and alignment (4)
1135 ; CHECK-LABEL: test_vld2_lane_f32
1136 ; CHECK: ld2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1137 entry:
1138   %in0 = extractvalue [2 x <2 x float>] %b.coerce, 0 ; unpack the incoming vector pair
1139   %in1 = extractvalue [2 x <2 x float>] %b.coerce, 1
1140   %addr = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1141   %ld = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8* %addr, <2 x float> %in0, <2 x float> %in1, i32 1, i32 4)
1142   %out0 = extractvalue { <2 x float>, <2 x float> } %ld, 0
1143   %out1 = extractvalue { <2 x float>, <2 x float> } %ld, 1
1144   %agg0 = insertvalue %struct.float32x2x2_t undef, <2 x float> %out0, 0, 0 ; rebuild the returned struct element by element
1145   %agg1 = insertvalue %struct.float32x2x2_t %agg0, <2 x float> %out1, 0, 1
1146   ret %struct.float32x2x2_t %agg1
1147 }
1148
1149 define %struct.float64x1x2_t @test_vld2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) { ; vld2lane.v1f64: trailing i32 operands are lane index (0) and alignment (8)
1150 ; CHECK-LABEL: test_vld2_lane_f64
1151 ; CHECK: ld2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1152 entry:
1153   %in0 = extractvalue [2 x <1 x double>] %b.coerce, 0 ; unpack the incoming vector pair
1154   %in1 = extractvalue [2 x <1 x double>] %b.coerce, 1
1155   %addr = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1156   %ld = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8* %addr, <1 x double> %in0, <1 x double> %in1, i32 0, i32 8)
1157   %out0 = extractvalue { <1 x double>, <1 x double> } %ld, 0
1158   %out1 = extractvalue { <1 x double>, <1 x double> } %ld, 1
1159   %agg0 = insertvalue %struct.float64x1x2_t undef, <1 x double> %out0, 0, 0 ; rebuild the returned struct element by element
1160   %agg1 = insertvalue %struct.float64x1x2_t %agg0, <1 x double> %out1, 0, 1
1161   ret %struct.float64x1x2_t %agg1
1162 }
1163
1164 define %struct.int16x8x3_t @test_vld3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) { ; vld3lane.v8i16: trailing i32 operands are lane index (7) and alignment (2)
1165 ; CHECK-LABEL: test_vld3q_lane_s16
1166 ; CHECK: ld3 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
1167 entry:
1168   %in0 = extractvalue [3 x <8 x i16>] %b.coerce, 0 ; unpack the three incoming vectors
1169   %in1 = extractvalue [3 x <8 x i16>] %b.coerce, 1
1170   %in2 = extractvalue [3 x <8 x i16>] %b.coerce, 2
1171   %addr = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1172   %ld = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8* %addr, <8 x i16> %in0, <8 x i16> %in1, <8 x i16> %in2, i32 7, i32 2)
1173   %out0 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %ld, 0
1174   %out1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %ld, 1
1175   %out2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %ld, 2
1176   %agg0 = insertvalue %struct.int16x8x3_t undef, <8 x i16> %out0, 0, 0 ; rebuild the returned struct element by element
1177   %agg1 = insertvalue %struct.int16x8x3_t %agg0, <8 x i16> %out1, 0, 1
1178   %agg2 = insertvalue %struct.int16x8x3_t %agg1, <8 x i16> %out2, 0, 2
1179   ret %struct.int16x8x3_t %agg2
1180 }
1181
1182 define %struct.int32x4x3_t @test_vld3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) { ; vld3lane.v4i32: trailing i32 operands are lane index (3) and alignment (4)
1183 ; CHECK-LABEL: test_vld3q_lane_s32
1184 ; CHECK: ld3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1185 entry:
1186   %in0 = extractvalue [3 x <4 x i32>] %b.coerce, 0 ; unpack the three incoming vectors
1187   %in1 = extractvalue [3 x <4 x i32>] %b.coerce, 1
1188   %in2 = extractvalue [3 x <4 x i32>] %b.coerce, 2
1189   %addr = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1190   %ld = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8* %addr, <4 x i32> %in0, <4 x i32> %in1, <4 x i32> %in2, i32 3, i32 4)
1191   %out0 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %ld, 0
1192   %out1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %ld, 1
1193   %out2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %ld, 2
1194   %agg0 = insertvalue %struct.int32x4x3_t undef, <4 x i32> %out0, 0, 0 ; rebuild the returned struct element by element
1195   %agg1 = insertvalue %struct.int32x4x3_t %agg0, <4 x i32> %out1, 0, 1
1196   %agg2 = insertvalue %struct.int32x4x3_t %agg1, <4 x i32> %out2, 0, 2
1197   ret %struct.int32x4x3_t %agg2
1198 }
1199
1200 define %struct.int64x2x3_t @test_vld3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) { ; vld3lane.v2i64: trailing i32 operands are lane index (1) and alignment (8)
1201 ; CHECK-LABEL: test_vld3q_lane_s64
1202 ; CHECK: ld3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1203 entry:
1204   %in0 = extractvalue [3 x <2 x i64>] %b.coerce, 0 ; unpack the three incoming vectors
1205   %in1 = extractvalue [3 x <2 x i64>] %b.coerce, 1
1206   %in2 = extractvalue [3 x <2 x i64>] %b.coerce, 2
1207   %addr = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1208   %ld = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8* %addr, <2 x i64> %in0, <2 x i64> %in1, <2 x i64> %in2, i32 1, i32 8)
1209   %out0 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %ld, 0
1210   %out1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %ld, 1
1211   %out2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %ld, 2
1212   %agg0 = insertvalue %struct.int64x2x3_t undef, <2 x i64> %out0, 0, 0 ; rebuild the returned struct element by element
1213   %agg1 = insertvalue %struct.int64x2x3_t %agg0, <2 x i64> %out1, 0, 1
1214   %agg2 = insertvalue %struct.int64x2x3_t %agg1, <2 x i64> %out2, 0, 2
1215   ret %struct.int64x2x3_t %agg2
1216 }
1217
1218 define %struct.float32x4x3_t @test_vld3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) { ; vld3lane.v4f32: trailing i32 operands are lane index (3) and alignment (4)
1219 ; CHECK-LABEL: test_vld3q_lane_f32
1220 ; CHECK: ld3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1221 entry:
1222   %in0 = extractvalue [3 x <4 x float>] %b.coerce, 0 ; unpack the three incoming vectors
1223   %in1 = extractvalue [3 x <4 x float>] %b.coerce, 1
1224   %in2 = extractvalue [3 x <4 x float>] %b.coerce, 2
1225   %addr = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1226   %ld = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8* %addr, <4 x float> %in0, <4 x float> %in1, <4 x float> %in2, i32 3, i32 4)
1227   %out0 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %ld, 0
1228   %out1 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %ld, 1
1229   %out2 = extractvalue { <4 x float>, <4 x float>, <4 x float> } %ld, 2
1230   %agg0 = insertvalue %struct.float32x4x3_t undef, <4 x float> %out0, 0, 0 ; rebuild the returned struct element by element
1231   %agg1 = insertvalue %struct.float32x4x3_t %agg0, <4 x float> %out1, 0, 1
1232   %agg2 = insertvalue %struct.float32x4x3_t %agg1, <4 x float> %out2, 0, 2
1233   ret %struct.float32x4x3_t %agg2
1234 }
1235
1236 define %struct.float64x2x3_t @test_vld3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) { ; vld3lane.v2f64: trailing i32 operands are lane index (1) and alignment (8)
1237 ; CHECK-LABEL: test_vld3q_lane_f64
1238 ; CHECK: ld3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1239 entry:
1240   %in0 = extractvalue [3 x <2 x double>] %b.coerce, 0 ; unpack the three incoming vectors
1241   %in1 = extractvalue [3 x <2 x double>] %b.coerce, 1
1242   %in2 = extractvalue [3 x <2 x double>] %b.coerce, 2
1243   %addr = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1244   %ld = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8* %addr, <2 x double> %in0, <2 x double> %in1, <2 x double> %in2, i32 1, i32 8)
1245   %out0 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %ld, 0
1246   %out1 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %ld, 1
1247   %out2 = extractvalue { <2 x double>, <2 x double>, <2 x double> } %ld, 2
1248   %agg0 = insertvalue %struct.float64x2x3_t undef, <2 x double> %out0, 0, 0 ; rebuild the returned struct element by element
1249   %agg1 = insertvalue %struct.float64x2x3_t %agg0, <2 x double> %out1, 0, 1
1250   %agg2 = insertvalue %struct.float64x2x3_t %agg1, <2 x double> %out2, 0, 2
1251   ret %struct.float64x2x3_t %agg2
1252 }
1253
1254 define %struct.int8x8x3_t @test_vld3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) { ; vld3lane.v8i8: trailing i32 operands are lane index (7) and alignment (1)
1255 ; CHECK-LABEL: test_vld3_lane_s8
1256 ; CHECK: ld3 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
1257 entry:
1258   %in0 = extractvalue [3 x <8 x i8>] %b.coerce, 0 ; unpack the three incoming vectors
1259   %in1 = extractvalue [3 x <8 x i8>] %b.coerce, 1
1260   %in2 = extractvalue [3 x <8 x i8>] %b.coerce, 2
1261   %ld = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8* %a, <8 x i8> %in0, <8 x i8> %in1, <8 x i8> %in2, i32 7, i32 1)
1262   %out0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %ld, 0
1263   %out1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %ld, 1
1264   %out2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %ld, 2
1265   %agg0 = insertvalue %struct.int8x8x3_t undef, <8 x i8> %out0, 0, 0 ; rebuild the returned struct element by element
1266   %agg1 = insertvalue %struct.int8x8x3_t %agg0, <8 x i8> %out1, 0, 1
1267   %agg2 = insertvalue %struct.int8x8x3_t %agg1, <8 x i8> %out2, 0, 2
1268   ret %struct.int8x8x3_t %agg2
1269 }
1270
1271 define %struct.int16x4x3_t @test_vld3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) { ; vld3lane.v4i16: trailing i32 operands are lane index (3) and alignment (2)
1272 ; CHECK-LABEL: test_vld3_lane_s16
1273 ; CHECK: ld3 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
1274 entry:
1275   %in0 = extractvalue [3 x <4 x i16>] %b.coerce, 0 ; unpack the three incoming vectors
1276   %in1 = extractvalue [3 x <4 x i16>] %b.coerce, 1
1277   %in2 = extractvalue [3 x <4 x i16>] %b.coerce, 2
1278   %addr = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1279   %ld = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8* %addr, <4 x i16> %in0, <4 x i16> %in1, <4 x i16> %in2, i32 3, i32 2)
1280   %out0 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld, 0
1281   %out1 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld, 1
1282   %out2 = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %ld, 2
1283   %agg0 = insertvalue %struct.int16x4x3_t undef, <4 x i16> %out0, 0, 0 ; rebuild the returned struct element by element
1284   %agg1 = insertvalue %struct.int16x4x3_t %agg0, <4 x i16> %out1, 0, 1
1285   %agg2 = insertvalue %struct.int16x4x3_t %agg1, <4 x i16> %out2, 0, 2
1286   ret %struct.int16x4x3_t %agg2
1287 }
1288
1289 define %struct.int32x2x3_t @test_vld3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) { ; vld3lane.v2i32: trailing i32 operands are lane index (1) and alignment (4)
1290 ; CHECK-LABEL: test_vld3_lane_s32
1291 ; CHECK: ld3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1292 entry:
1293   %in0 = extractvalue [3 x <2 x i32>] %b.coerce, 0 ; unpack the three incoming vectors
1294   %in1 = extractvalue [3 x <2 x i32>] %b.coerce, 1
1295   %in2 = extractvalue [3 x <2 x i32>] %b.coerce, 2
1296   %addr = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1297   %ld = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8* %addr, <2 x i32> %in0, <2 x i32> %in1, <2 x i32> %in2, i32 1, i32 4)
1298   %out0 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %ld, 0
1299   %out1 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %ld, 1
1300   %out2 = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %ld, 2
1301   %agg0 = insertvalue %struct.int32x2x3_t undef, <2 x i32> %out0, 0, 0 ; rebuild the returned struct element by element
1302   %agg1 = insertvalue %struct.int32x2x3_t %agg0, <2 x i32> %out1, 0, 1
1303   %agg2 = insertvalue %struct.int32x2x3_t %agg1, <2 x i32> %out2, 0, 2
1304   ret %struct.int32x2x3_t %agg2
1305 }
1306
1307 define %struct.int64x1x3_t @test_vld3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) { ; vld3lane.v1i64: trailing i32 operands are lane index (0) and alignment (8)
1308 ; CHECK-LABEL: test_vld3_lane_s64
1309 ; CHECK: ld3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1310 entry:
1311   %in0 = extractvalue [3 x <1 x i64>] %b.coerce, 0 ; unpack the three incoming vectors
1312   %in1 = extractvalue [3 x <1 x i64>] %b.coerce, 1
1313   %in2 = extractvalue [3 x <1 x i64>] %b.coerce, 2
1314   %addr = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1315   %ld = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8* %addr, <1 x i64> %in0, <1 x i64> %in1, <1 x i64> %in2, i32 0, i32 8)
1316   %out0 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %ld, 0
1317   %out1 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %ld, 1
1318   %out2 = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %ld, 2
1319   %agg0 = insertvalue %struct.int64x1x3_t undef, <1 x i64> %out0, 0, 0 ; rebuild the returned struct element by element
1320   %agg1 = insertvalue %struct.int64x1x3_t %agg0, <1 x i64> %out1, 0, 1
1321   %agg2 = insertvalue %struct.int64x1x3_t %agg1, <1 x i64> %out2, 0, 2
1322   ret %struct.int64x1x3_t %agg2
1323 }
1324
1325 define %struct.float32x2x3_t @test_vld3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) { ; vld3lane.v2f32: trailing i32 operands are lane index (1) and alignment (4)
1326 ; CHECK-LABEL: test_vld3_lane_f32
1327 ; CHECK: ld3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1328 entry:
1329   %in0 = extractvalue [3 x <2 x float>] %b.coerce, 0 ; unpack the three incoming vectors
1330   %in1 = extractvalue [3 x <2 x float>] %b.coerce, 1
1331   %in2 = extractvalue [3 x <2 x float>] %b.coerce, 2
1332   %addr = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1333   %ld = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8* %addr, <2 x float> %in0, <2 x float> %in1, <2 x float> %in2, i32 1, i32 4)
1334   %out0 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %ld, 0
1335   %out1 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %ld, 1
1336   %out2 = extractvalue { <2 x float>, <2 x float>, <2 x float> } %ld, 2
1337   %agg0 = insertvalue %struct.float32x2x3_t undef, <2 x float> %out0, 0, 0 ; rebuild the returned struct element by element
1338   %agg1 = insertvalue %struct.float32x2x3_t %agg0, <2 x float> %out1, 0, 1
1339   %agg2 = insertvalue %struct.float32x2x3_t %agg1, <2 x float> %out2, 0, 2
1340   ret %struct.float32x2x3_t %agg2
1341 }
1342
1343 define %struct.float64x1x3_t @test_vld3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) { ; vld3lane.v1f64: trailing i32 operands are lane index (0) and alignment (8)
1344 ; CHECK-LABEL: test_vld3_lane_f64
1345 ; CHECK: ld3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1346 entry:
1347   %in0 = extractvalue [3 x <1 x double>] %b.coerce, 0 ; unpack the three incoming vectors
1348   %in1 = extractvalue [3 x <1 x double>] %b.coerce, 1
1349   %in2 = extractvalue [3 x <1 x double>] %b.coerce, 2
1350   %addr = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1351   %ld = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8* %addr, <1 x double> %in0, <1 x double> %in1, <1 x double> %in2, i32 0, i32 8)
1352   %out0 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %ld, 0
1353   %out1 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %ld, 1
1354   %out2 = extractvalue { <1 x double>, <1 x double>, <1 x double> } %ld, 2
1355   %agg0 = insertvalue %struct.float64x1x3_t undef, <1 x double> %out0, 0, 0 ; rebuild the returned struct element by element
1356   %agg1 = insertvalue %struct.float64x1x3_t %agg0, <1 x double> %out1, 0, 1
1357   %agg2 = insertvalue %struct.float64x1x3_t %agg1, <1 x double> %out2, 0, 2
1358   ret %struct.float64x1x3_t %agg2
1359 }
1360
1361 define %struct.int8x16x4_t @test_vld4q_lane_s8(i8* %a, [4 x <16 x i8>] %b.coerce) { ; vld4lane.v16i8: trailing i32 operands are lane index (15) and alignment (1)
1362 ; CHECK-LABEL: test_vld4q_lane_s8
1363 ; CHECK: ld4 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
1364 entry:
1365   %in0 = extractvalue [4 x <16 x i8>] %b.coerce, 0 ; unpack the four incoming vectors
1366   %in1 = extractvalue [4 x <16 x i8>] %b.coerce, 1
1367   %in2 = extractvalue [4 x <16 x i8>] %b.coerce, 2
1368   %in3 = extractvalue [4 x <16 x i8>] %b.coerce, 3
1369   %ld = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8* %a, <16 x i8> %in0, <16 x i8> %in1, <16 x i8> %in2, <16 x i8> %in3, i32 15, i32 1)
1370   %out0 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld, 0
1371   %out1 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld, 1
1372   %out2 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld, 2
1373   %out3 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %ld, 3
1374   %agg0 = insertvalue %struct.int8x16x4_t undef, <16 x i8> %out0, 0, 0 ; rebuild the returned struct element by element
1375   %agg1 = insertvalue %struct.int8x16x4_t %agg0, <16 x i8> %out1, 0, 1
1376   %agg2 = insertvalue %struct.int8x16x4_t %agg1, <16 x i8> %out2, 0, 2
1377   %agg3 = insertvalue %struct.int8x16x4_t %agg2, <16 x i8> %out3, 0, 3
1378   ret %struct.int8x16x4_t %agg3
1379 }
1380
1381 define %struct.int16x8x4_t @test_vld4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) { ; vld4lane.v8i16: trailing i32 operands are lane index (7) and alignment (2)
1382 ; CHECK-LABEL: test_vld4q_lane_s16
1383 ; CHECK: ld4 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
1384 entry:
1385   %in0 = extractvalue [4 x <8 x i16>] %b.coerce, 0 ; unpack the four incoming vectors
1386   %in1 = extractvalue [4 x <8 x i16>] %b.coerce, 1
1387   %in2 = extractvalue [4 x <8 x i16>] %b.coerce, 2
1388   %in3 = extractvalue [4 x <8 x i16>] %b.coerce, 3
1389   %addr = bitcast i16* %a to i8* ; the intrinsic takes an i8* address
1390   %ld = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8* %addr, <8 x i16> %in0, <8 x i16> %in1, <8 x i16> %in2, <8 x i16> %in3, i32 7, i32 2)
1391   %out0 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld, 0
1392   %out1 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld, 1
1393   %out2 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld, 2
1394   %out3 = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %ld, 3
1395   %agg0 = insertvalue %struct.int16x8x4_t undef, <8 x i16> %out0, 0, 0 ; rebuild the returned struct element by element
1396   %agg1 = insertvalue %struct.int16x8x4_t %agg0, <8 x i16> %out1, 0, 1
1397   %agg2 = insertvalue %struct.int16x8x4_t %agg1, <8 x i16> %out2, 0, 2
1398   %agg3 = insertvalue %struct.int16x8x4_t %agg2, <8 x i16> %out3, 0, 3
1399   ret %struct.int16x8x4_t %agg3
1400 }
1401
1402 define %struct.int32x4x4_t @test_vld4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) { ; vld4lane.v4i32: trailing i32 operands are lane index (3) and alignment (4)
1403 ; CHECK-LABEL: test_vld4q_lane_s32
1404 ; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1405 entry:
1406   %in0 = extractvalue [4 x <4 x i32>] %b.coerce, 0 ; unpack the four incoming vectors
1407   %in1 = extractvalue [4 x <4 x i32>] %b.coerce, 1
1408   %in2 = extractvalue [4 x <4 x i32>] %b.coerce, 2
1409   %in3 = extractvalue [4 x <4 x i32>] %b.coerce, 3
1410   %addr = bitcast i32* %a to i8* ; the intrinsic takes an i8* address
1411   %ld = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8* %addr, <4 x i32> %in0, <4 x i32> %in1, <4 x i32> %in2, <4 x i32> %in3, i32 3, i32 4)
1412   %out0 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld, 0
1413   %out1 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld, 1
1414   %out2 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld, 2
1415   %out3 = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %ld, 3
1416   %agg0 = insertvalue %struct.int32x4x4_t undef, <4 x i32> %out0, 0, 0 ; rebuild the returned struct element by element
1417   %agg1 = insertvalue %struct.int32x4x4_t %agg0, <4 x i32> %out1, 0, 1
1418   %agg2 = insertvalue %struct.int32x4x4_t %agg1, <4 x i32> %out2, 0, 2
1419   %agg3 = insertvalue %struct.int32x4x4_t %agg2, <4 x i32> %out3, 0, 3
1420   ret %struct.int32x4x4_t %agg3
1421 }
1422
1423 define %struct.int64x2x4_t @test_vld4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) { ; vld4lane.v2i64: trailing i32 operands are lane index (1) and alignment (8)
1424 ; CHECK-LABEL: test_vld4q_lane_s64
1425 ; CHECK: ld4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1426 entry:
1427   %in0 = extractvalue [4 x <2 x i64>] %b.coerce, 0 ; unpack the four incoming vectors
1428   %in1 = extractvalue [4 x <2 x i64>] %b.coerce, 1
1429   %in2 = extractvalue [4 x <2 x i64>] %b.coerce, 2
1430   %in3 = extractvalue [4 x <2 x i64>] %b.coerce, 3
1431   %addr = bitcast i64* %a to i8* ; the intrinsic takes an i8* address
1432   %ld = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8* %addr, <2 x i64> %in0, <2 x i64> %in1, <2 x i64> %in2, <2 x i64> %in3, i32 1, i32 8)
1433   %out0 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld, 0
1434   %out1 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld, 1
1435   %out2 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld, 2
1436   %out3 = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %ld, 3
1437   %agg0 = insertvalue %struct.int64x2x4_t undef, <2 x i64> %out0, 0, 0 ; rebuild the returned struct element by element
1438   %agg1 = insertvalue %struct.int64x2x4_t %agg0, <2 x i64> %out1, 0, 1
1439   %agg2 = insertvalue %struct.int64x2x4_t %agg1, <2 x i64> %out2, 0, 2
1440   %agg3 = insertvalue %struct.int64x2x4_t %agg2, <2 x i64> %out3, 0, 3
1441   ret %struct.int64x2x4_t %agg3
1442 }
1443
1444 define %struct.float32x4x4_t @test_vld4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) { ; vld4lane.v4f32: trailing i32 operands are lane index (3) and alignment (4)
1445 ; CHECK-LABEL: test_vld4q_lane_f32
1446 ; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1447 entry:
1448   %in0 = extractvalue [4 x <4 x float>] %b.coerce, 0 ; unpack the four incoming vectors
1449   %in1 = extractvalue [4 x <4 x float>] %b.coerce, 1
1450   %in2 = extractvalue [4 x <4 x float>] %b.coerce, 2
1451   %in3 = extractvalue [4 x <4 x float>] %b.coerce, 3
1452   %addr = bitcast float* %a to i8* ; the intrinsic takes an i8* address
1453   %ld = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8* %addr, <4 x float> %in0, <4 x float> %in1, <4 x float> %in2, <4 x float> %in3, i32 3, i32 4)
1454   %out0 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld, 0
1455   %out1 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld, 1
1456   %out2 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld, 2
1457   %out3 = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %ld, 3
1458   %agg0 = insertvalue %struct.float32x4x4_t undef, <4 x float> %out0, 0, 0 ; rebuild the returned struct element by element
1459   %agg1 = insertvalue %struct.float32x4x4_t %agg0, <4 x float> %out1, 0, 1
1460   %agg2 = insertvalue %struct.float32x4x4_t %agg1, <4 x float> %out2, 0, 2
1461   %agg3 = insertvalue %struct.float32x4x4_t %agg2, <4 x float> %out3, 0, 3
1462   ret %struct.float32x4x4_t %agg3
1463 }
1464
1465 define %struct.float64x2x4_t @test_vld4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) { ; vld4lane.v2f64: trailing i32 operands are lane index (1) and alignment (8)
1466 ; CHECK-LABEL: test_vld4q_lane_f64
1467 ; CHECK: ld4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1468 entry:
1469   %in0 = extractvalue [4 x <2 x double>] %b.coerce, 0 ; unpack the four incoming vectors
1470   %in1 = extractvalue [4 x <2 x double>] %b.coerce, 1
1471   %in2 = extractvalue [4 x <2 x double>] %b.coerce, 2
1472   %in3 = extractvalue [4 x <2 x double>] %b.coerce, 3
1473   %addr = bitcast double* %a to i8* ; the intrinsic takes an i8* address
1474   %ld = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8* %addr, <2 x double> %in0, <2 x double> %in1, <2 x double> %in2, <2 x double> %in3, i32 1, i32 8)
1475   %out0 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld, 0
1476   %out1 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld, 1
1477   %out2 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld, 2
1478   %out3 = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %ld, 3
1479   %agg0 = insertvalue %struct.float64x2x4_t undef, <2 x double> %out0, 0, 0 ; rebuild the returned struct element by element
1480   %agg1 = insertvalue %struct.float64x2x4_t %agg0, <2 x double> %out1, 0, 1
1481   %agg2 = insertvalue %struct.float64x2x4_t %agg1, <2 x double> %out2, 0, 2
1482   %agg3 = insertvalue %struct.float64x2x4_t %agg2, <2 x double> %out3, 0, 3
1483   ret %struct.float64x2x4_t %agg3
1484 }
1485
1486 define %struct.int8x8x4_t @test_vld4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) { ; vld4lane.v8i8: trailing i32 operands are lane index (7) and alignment (1)
1487 ; CHECK-LABEL: test_vld4_lane_s8
1488 ; CHECK: ld4 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
1489 entry:
1490   %in0 = extractvalue [4 x <8 x i8>] %b.coerce, 0 ; unpack the four incoming vectors
1491   %in1 = extractvalue [4 x <8 x i8>] %b.coerce, 1
1492   %in2 = extractvalue [4 x <8 x i8>] %b.coerce, 2
1493   %in3 = extractvalue [4 x <8 x i8>] %b.coerce, 3
1494   %ld = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8* %a, <8 x i8> %in0, <8 x i8> %in1, <8 x i8> %in2, <8 x i8> %in3, i32 7, i32 1)
1495   %out0 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld, 0
1496   %out1 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld, 1
1497   %out2 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld, 2
1498   %out3 = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %ld, 3
1499   %agg0 = insertvalue %struct.int8x8x4_t undef, <8 x i8> %out0, 0, 0 ; rebuild the returned struct element by element
1500   %agg1 = insertvalue %struct.int8x8x4_t %agg0, <8 x i8> %out1, 0, 1
1501   %agg2 = insertvalue %struct.int8x8x4_t %agg1, <8 x i8> %out2, 0, 2
1502   %agg3 = insertvalue %struct.int8x8x4_t %agg2, <8 x i8> %out3, 0, 3
1503   ret %struct.int8x8x4_t %agg3
1504 }
1505
1506 define %struct.int16x4x4_t @test_vld4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
     ; Lane-load codegen test: unpacks the four <4 x i16> vectors from %b.coerce,
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vld4lane.v4i16
     ; (trailing operands i32 3, i32 2 -- presumably lane index and alignment) and
     ; repacks the four results into %struct.int16x4x4_t. The expected output is a
     ; four-register single-lane ld4 on .h elements with the address in x0.
     ; The call result is named %vld3_lane although the vld4lane intrinsic is used.
1507 ; CHECK-LABEL: test_vld4_lane_s16
1508 ; CHECK: ld4 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
1509 entry:
1510   %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
1511   %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
1512   %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
1513   %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
1514   %0 = bitcast i16* %a to i8*
1515   %vld3_lane = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
1516   %vld3_lane.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 0
1517   %vld3_lane.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 1
1518   %vld3_lane.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 2
1519   %vld3_lane.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld3_lane, 3
1520   %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld3_lane.fca.0.extract, 0, 0
1521   %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld3_lane.fca.1.extract, 0, 1
1522   %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld3_lane.fca.2.extract, 0, 2
1523   %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld3_lane.fca.3.extract, 0, 3
1524   ret %struct.int16x4x4_t %.fca.0.3.insert
1525 }
1526
1527 define %struct.int32x2x4_t @test_vld4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
     ; Lane-load codegen test: unpacks the four <2 x i32> vectors from %b.coerce,
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vld4lane.v2i32
     ; (trailing operands i32 1, i32 4 -- presumably lane index and alignment) and
     ; repacks the four results into %struct.int32x2x4_t. The expected output is a
     ; four-register single-lane ld4 on .s elements with the address in x0.
     ; The call result is named %vld3_lane although the vld4lane intrinsic is used.
1528 ; CHECK-LABEL: test_vld4_lane_s32
1529 ; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1530 entry:
1531   %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
1532   %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
1533   %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
1534   %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
1535   %0 = bitcast i32* %a to i8*
1536   %vld3_lane = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
1537   %vld3_lane.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 0
1538   %vld3_lane.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 1
1539   %vld3_lane.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 2
1540   %vld3_lane.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld3_lane, 3
1541   %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld3_lane.fca.0.extract, 0, 0
1542   %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld3_lane.fca.1.extract, 0, 1
1543   %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld3_lane.fca.2.extract, 0, 2
1544   %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld3_lane.fca.3.extract, 0, 3
1545   ret %struct.int32x2x4_t %.fca.0.3.insert
1546 }
1547
1548 define %struct.int64x1x4_t @test_vld4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
     ; Lane-load codegen test: unpacks the four <1 x i64> vectors from %b.coerce,
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vld4lane.v1i64
     ; (trailing operands i32 0, i32 8 -- presumably lane index and alignment) and
     ; repacks the four results into %struct.int64x1x4_t. The expected output is a
     ; four-register single-lane ld4 on .d elements with the address in x0.
     ; The call result is named %vld3_lane although the vld4lane intrinsic is used.
1549 ; CHECK-LABEL: test_vld4_lane_s64
1550 ; CHECK: ld4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1551 entry:
1552   %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
1553   %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
1554   %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
1555   %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
1556   %0 = bitcast i64* %a to i8*
1557   %vld3_lane = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
1558   %vld3_lane.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 0
1559   %vld3_lane.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 1
1560   %vld3_lane.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 2
1561   %vld3_lane.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld3_lane, 3
1562   %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld3_lane.fca.0.extract, 0, 0
1563   %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld3_lane.fca.1.extract, 0, 1
1564   %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld3_lane.fca.2.extract, 0, 2
1565   %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld3_lane.fca.3.extract, 0, 3
1566   ret %struct.int64x1x4_t %.fca.0.3.insert
1567 }
1568
1569 define %struct.float32x2x4_t @test_vld4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
     ; Lane-load codegen test: unpacks the four <2 x float> vectors from %b.coerce,
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vld4lane.v2f32
     ; (trailing operands i32 1, i32 4 -- presumably lane index and alignment) and
     ; repacks the four results into %struct.float32x2x4_t. The expected output is
     ; a four-register single-lane ld4 on .s elements with the address in x0.
     ; The call result is named %vld3_lane although the vld4lane intrinsic is used.
1570 ; CHECK-LABEL: test_vld4_lane_f32
1571 ; CHECK: ld4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1572 entry:
1573   %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
1574   %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
1575   %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
1576   %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
1577   %0 = bitcast float* %a to i8*
1578   %vld3_lane = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
1579   %vld3_lane.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 0
1580   %vld3_lane.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 1
1581   %vld3_lane.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 2
1582   %vld3_lane.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld3_lane, 3
1583   %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld3_lane.fca.0.extract, 0, 0
1584   %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld3_lane.fca.1.extract, 0, 1
1585   %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld3_lane.fca.2.extract, 0, 2
1586   %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld3_lane.fca.3.extract, 0, 3
1587   ret %struct.float32x2x4_t %.fca.0.3.insert
1588 }
1589
1590 define %struct.float64x1x4_t @test_vld4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
     ; Lane-load codegen test: unpacks the four <1 x double> vectors from %b.coerce,
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vld4lane.v1f64
     ; (trailing operands i32 0, i32 8 -- presumably lane index and alignment) and
     ; repacks the four results into %struct.float64x1x4_t. The expected output is
     ; a four-register single-lane ld4 on .d elements with the address in x0.
     ; The call result is named %vld3_lane although the vld4lane intrinsic is used.
1591 ; CHECK-LABEL: test_vld4_lane_f64
1592 ; CHECK: ld4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1593 entry:
1594   %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
1595   %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
1596   %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
1597   %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
1598   %0 = bitcast double* %a to i8*
1599   %vld3_lane = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
1600   %vld3_lane.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 0
1601   %vld3_lane.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 1
1602   %vld3_lane.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 2
1603   %vld3_lane.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld3_lane, 3
1604   %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld3_lane.fca.0.extract, 0, 0
1605   %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld3_lane.fca.1.extract, 0, 1
1606   %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld3_lane.fca.2.extract, 0, 2
1607   %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld3_lane.fca.3.extract, 0, 3
1608   ret %struct.float64x1x4_t %.fca.0.3.insert
1609 }
1610
1611 define void @test_vst1q_lane_s8(i8* %a, <16 x i8> %b) {
     ; Store-lane codegen test: extracts element 15 of the <16 x i8> %b and stores
     ; it through %a (align 1). The expected output is a single-lane st1 on a .b
     ; element with the address in x0; any vector register and lane are accepted.
1612 ; CHECK-LABEL: test_vst1q_lane_s8
1613 ; CHECK: st1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
1614 entry:
1615   %0 = extractelement <16 x i8> %b, i32 15
1616   store i8 %0, i8* %a, align 1
1617   ret void
1618 }
1619
1620 define void @test_vst1q_lane_s16(i16* %a, <8 x i16> %b) {
     ; Store-lane codegen test: extracts element 7 of the <8 x i16> %b and stores
     ; it through %a (align 2). The expected output is a single-lane st1 on a .h
     ; element with the address in x0; any vector register and lane are accepted.
1621 ; CHECK-LABEL: test_vst1q_lane_s16
1622 ; CHECK: st1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
1623 entry:
1624   %0 = extractelement <8 x i16> %b, i32 7
1625   store i16 %0, i16* %a, align 2
1626   ret void
1627 }
1628
1629 define void @test_vst1q_lane_s32(i32* %a, <4 x i32> %b) {
     ; Store-lane codegen test: extracts element 3 of the <4 x i32> %b and stores
     ; it through %a (align 4). The expected output is a single-lane st1 on a .s
     ; element with the address in x0; any vector register and lane are accepted.
1630 ; CHECK-LABEL: test_vst1q_lane_s32
1631 ; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1632 entry:
1633   %0 = extractelement <4 x i32> %b, i32 3
1634   store i32 %0, i32* %a, align 4
1635   ret void
1636 }
1637
1638 define void @test_vst1q_lane_s64(i64* %a, <2 x i64> %b) {
     ; Store-lane codegen test: extracts element 1 of the <2 x i64> %b and stores
     ; it through %a (align 8). The expected output is a single-lane st1 on a .d
     ; element with the address in x0; any vector register and lane are accepted.
1639 ; CHECK-LABEL: test_vst1q_lane_s64
1640 ; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1641 entry:
1642   %0 = extractelement <2 x i64> %b, i32 1
1643   store i64 %0, i64* %a, align 8
1644   ret void
1645 }
1646
1647 define void @test_vst1q_lane_f32(float* %a, <4 x float> %b) {
     ; Store-lane codegen test: extracts element 3 of the <4 x float> %b and stores
     ; it through %a (align 4). The expected output is a single-lane st1 on a .s
     ; element with the address in x0; any vector register and lane are accepted.
1648 ; CHECK-LABEL: test_vst1q_lane_f32
1649 ; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1650 entry:
1651   %0 = extractelement <4 x float> %b, i32 3
1652   store float %0, float* %a, align 4
1653   ret void
1654 }
1655
1656 define void @test_vst1q_lane_f64(double* %a, <2 x double> %b) {
     ; Store-lane codegen test: extracts element 1 of the <2 x double> %b and stores
     ; it through %a (align 8). The expected output is a single-lane st1 on a .d
     ; element with the address in x0; any vector register and lane are accepted.
1657 ; CHECK-LABEL: test_vst1q_lane_f64
1658 ; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1659 entry:
1660   %0 = extractelement <2 x double> %b, i32 1
1661   store double %0, double* %a, align 8
1662   ret void
1663 }
1664
1665 define void @test_vst1_lane_s8(i8* %a, <8 x i8> %b) {
     ; Store-lane codegen test: extracts element 7 of the <8 x i8> %b and stores
     ; it through %a (align 1). The expected output is a single-lane st1 on a .b
     ; element with the address in x0; any vector register and lane are accepted.
1666 ; CHECK-LABEL: test_vst1_lane_s8
1667 ; CHECK: st1 { {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
1668 entry:
1669   %0 = extractelement <8 x i8> %b, i32 7
1670   store i8 %0, i8* %a, align 1
1671   ret void
1672 }
1673
1674 define void @test_vst1_lane_s16(i16* %a, <4 x i16> %b) {
     ; Store-lane codegen test: extracts element 3 of the <4 x i16> %b and stores
     ; it through %a (align 2). The expected output is a single-lane st1 on a .h
     ; element with the address in x0; any vector register and lane are accepted.
1675 ; CHECK-LABEL: test_vst1_lane_s16
1676 ; CHECK: st1 { {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
1677 entry:
1678   %0 = extractelement <4 x i16> %b, i32 3
1679   store i16 %0, i16* %a, align 2
1680   ret void
1681 }
1682
1683 define void @test_vst1_lane_s32(i32* %a, <2 x i32> %b) {
     ; Store-lane codegen test: extracts element 1 of the <2 x i32> %b and stores
     ; it through %a (align 4). The expected output is a single-lane st1 on a .s
     ; element with the address in x0; any vector register and lane are accepted.
1684 ; CHECK-LABEL: test_vst1_lane_s32
1685 ; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1686 entry:
1687   %0 = extractelement <2 x i32> %b, i32 1
1688   store i32 %0, i32* %a, align 4
1689   ret void
1690 }
1691
1692 define void @test_vst1_lane_s64(i64* %a, <1 x i64> %b) {
     ; Store-lane codegen test: extracts element 0 of the <1 x i64> %b and stores
     ; it through %a (align 8). The expected output is a single-lane st1 on a .d
     ; element with the address in x0; any vector register and lane are accepted.
1693 ; CHECK-LABEL: test_vst1_lane_s64
1694 ; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1695 entry:
1696   %0 = extractelement <1 x i64> %b, i32 0
1697   store i64 %0, i64* %a, align 8
1698   ret void
1699 }
1700
1701 define void @test_vst1_lane_f32(float* %a, <2 x float> %b) {
     ; Store-lane codegen test: extracts element 1 of the <2 x float> %b and stores
     ; it through %a (align 4). The expected output is a single-lane st1 on a .s
     ; element with the address in x0; any vector register and lane are accepted.
1702 ; CHECK-LABEL: test_vst1_lane_f32
1703 ; CHECK: st1 { {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1704 entry:
1705   %0 = extractelement <2 x float> %b, i32 1
1706   store float %0, float* %a, align 4
1707   ret void
1708 }
1709
1710 define void @test_vst1_lane_f64(double* %a, <1 x double> %b) {
     ; Store-lane codegen test: extracts element 0 of the <1 x double> %b and stores
     ; it through %a (align 8). The expected output is a single-lane st1 on a .d
     ; element with the address in x0; any vector register and lane are accepted.
1711 ; CHECK-LABEL: test_vst1_lane_f64
1712 ; CHECK: st1 { {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1713 entry:
1714   %0 = extractelement <1 x double> %b, i32 0
1715   store double %0, double* %a, align 8
1716   ret void
1717 }
1718
1719 define void @test_vst2q_lane_s8(i8* %a, [2 x <16 x i8>] %b.coerce) {
     ; Store-lane codegen test: unpacks both <16 x i8> vectors from %b.coerce and
     ; passes them with %a to @llvm.arm.neon.vst2lane.v16i8 (trailing operands
     ; i32 15, i32 1 -- presumably lane index and alignment). The expected output
     ; is a two-register single-lane st2 on .b elements with the address in x0.
1720 ; CHECK-LABEL: test_vst2q_lane_s8
1721 ; CHECK: st2 { {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
1722 entry:
1723   %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0
1724   %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
1725   tail call void @llvm.arm.neon.vst2lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 15, i32 1)
1726   ret void
1727 }
1728
1729 define void @test_vst2q_lane_s16(i16* %a, [2 x <8 x i16>] %b.coerce) {
     ; Store-lane codegen test: unpacks both <8 x i16> vectors from %b.coerce and
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v8i16
     ; (trailing operands i32 7, i32 2 -- presumably lane index and alignment). The
     ; expected output is a two-register single-lane st2 on .h elements via x0.
1730 ; CHECK-LABEL: test_vst2q_lane_s16
1731 ; CHECK: st2 { {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
1732 entry:
1733   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0
1734   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
1735   %0 = bitcast i16* %a to i8*
1736   tail call void @llvm.arm.neon.vst2lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 7, i32 2)
1737   ret void
1738 }
1739
1740 define void @test_vst2q_lane_s32(i32* %a, [2 x <4 x i32>] %b.coerce) {
     ; Store-lane codegen test: unpacks both <4 x i32> vectors from %b.coerce and
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v4i32
     ; (trailing operands i32 3, i32 4 -- presumably lane index and alignment). The
     ; expected output is a two-register single-lane st2 on .s elements via x0.
1741 ; CHECK-LABEL: test_vst2q_lane_s32
1742 ; CHECK: st2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1743 entry:
1744   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0
1745   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
1746   %0 = bitcast i32* %a to i8*
1747   tail call void @llvm.arm.neon.vst2lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 3, i32 4)
1748   ret void
1749 }
1750
1751 define void @test_vst2q_lane_s64(i64* %a, [2 x <2 x i64>] %b.coerce) {
     ; Store-lane codegen test: unpacks both <2 x i64> vectors from %b.coerce and
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v2i64
     ; (trailing operands i32 1, i32 8 -- presumably lane index and alignment). The
     ; expected output is a two-register single-lane st2 on .d elements via x0.
1752 ; CHECK-LABEL: test_vst2q_lane_s64
1753 ; CHECK: st2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1754 entry:
1755   %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0
1756   %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
1757   %0 = bitcast i64* %a to i8*
1758   tail call void @llvm.arm.neon.vst2lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 1, i32 8)
1759   ret void
1760 }
1761
1762 define void @test_vst2q_lane_f32(float* %a, [2 x <4 x float>] %b.coerce) {
     ; Store-lane codegen test: unpacks both <4 x float> vectors from %b.coerce and
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v4f32
     ; (trailing operands i32 3, i32 4 -- presumably lane index and alignment). The
     ; expected output is a two-register single-lane st2 on .s elements via x0.
1763 ; CHECK-LABEL: test_vst2q_lane_f32
1764 ; CHECK: st2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1765 entry:
1766   %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0
1767   %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
1768   %0 = bitcast float* %a to i8*
1769   tail call void @llvm.arm.neon.vst2lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 3, i32 4)
1770   ret void
1771 }
1772
1773 define void @test_vst2q_lane_f64(double* %a, [2 x <2 x double>] %b.coerce) {
     ; Store-lane codegen test: unpacks both <2 x double> vectors from %b.coerce and
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v2f64
     ; (trailing operands i32 1, i32 8 -- presumably lane index and alignment). The
     ; expected output is a two-register single-lane st2 on .d elements via x0.
1774 ; CHECK-LABEL: test_vst2q_lane_f64
1775 ; CHECK: st2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1776 entry:
1777   %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0
1778   %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
1779   %0 = bitcast double* %a to i8*
1780   tail call void @llvm.arm.neon.vst2lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 1, i32 8)
1781   ret void
1782 }
1783
1784 define void @test_vst2_lane_s8(i8* %a, [2 x <8 x i8>] %b.coerce) {
     ; Store-lane codegen test: unpacks both <8 x i8> vectors from %b.coerce and
     ; passes them with %a to @llvm.arm.neon.vst2lane.v8i8 (trailing operands
     ; i32 7, i32 1 -- presumably lane index and alignment). The expected output
     ; is a two-register single-lane st2 on .b elements with the address in x0.
1785 ; CHECK-LABEL: test_vst2_lane_s8
1786 ; CHECK: st2 { {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
1787 entry:
1788   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0
1789   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
1790   tail call void @llvm.arm.neon.vst2lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 7, i32 1)
1791   ret void
1792 }
1793
1794 define void @test_vst2_lane_s16(i16* %a, [2 x <4 x i16>] %b.coerce) {
     ; Store-lane codegen test: unpacks both <4 x i16> vectors from %b.coerce and
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v4i16
     ; (trailing operands i32 3, i32 2 -- presumably lane index and alignment). The
     ; expected output is a two-register single-lane st2 on .h elements via x0.
1795 ; CHECK-LABEL: test_vst2_lane_s16
1796 ; CHECK: st2 { {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
1797 entry:
1798   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0
1799   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
1800   %0 = bitcast i16* %a to i8*
1801   tail call void @llvm.arm.neon.vst2lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 3, i32 2)
1802   ret void
1803 }
1804
1805 define void @test_vst2_lane_s32(i32* %a, [2 x <2 x i32>] %b.coerce) {
     ; Store-lane codegen test: unpacks both <2 x i32> vectors from %b.coerce and
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v2i32
     ; (trailing operands i32 1, i32 4 -- presumably lane index and alignment). The
     ; expected output is a two-register single-lane st2 on .s elements via x0.
1806 ; CHECK-LABEL: test_vst2_lane_s32
1807 ; CHECK: st2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1808 entry:
1809   %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0
1810   %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
1811   %0 = bitcast i32* %a to i8*
1812   tail call void @llvm.arm.neon.vst2lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 1, i32 4)
1813   ret void
1814 }
1815
1816 define void @test_vst2_lane_s64(i64* %a, [2 x <1 x i64>] %b.coerce) {
     ; Store-lane codegen test: unpacks both <1 x i64> vectors from %b.coerce and
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v1i64
     ; (trailing operands i32 0, i32 8 -- presumably lane index and alignment). The
     ; expected output is a two-register single-lane st2 on .d elements via x0.
1817 ; CHECK-LABEL: test_vst2_lane_s64
1818 ; CHECK: st2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1819 entry:
1820   %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0
1821   %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
1822   %0 = bitcast i64* %a to i8*
1823   tail call void @llvm.arm.neon.vst2lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 0, i32 8)
1824   ret void
1825 }
1826
1827 define void @test_vst2_lane_f32(float* %a, [2 x <2 x float>] %b.coerce) {
     ; Store-lane codegen test: unpacks both <2 x float> vectors from %b.coerce and
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v2f32
     ; (trailing operands i32 1, i32 4 -- presumably lane index and alignment). The
     ; expected output is a two-register single-lane st2 on .s elements via x0.
1828 ; CHECK-LABEL: test_vst2_lane_f32
1829 ; CHECK: st2 { {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1830 entry:
1831   %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0
1832   %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
1833   %0 = bitcast float* %a to i8*
1834   tail call void @llvm.arm.neon.vst2lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 1, i32 4)
1835   ret void
1836 }
1837
1838 define void @test_vst2_lane_f64(double* %a, [2 x <1 x double>] %b.coerce) {
     ; Store-lane codegen test: unpacks both <1 x double> vectors from %b.coerce and
     ; passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst2lane.v1f64
     ; (trailing operands i32 0, i32 8 -- presumably lane index and alignment). The
     ; expected output is a two-register single-lane st2 on .d elements via x0.
1839 ; CHECK-LABEL: test_vst2_lane_f64
1840 ; CHECK: st2 { {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1841 entry:
1842   %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0
1843   %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
1844   %0 = bitcast double* %a to i8*
1845   tail call void @llvm.arm.neon.vst2lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 0, i32 8)
1846   ret void
1847 }
1848
1849 define void @test_vst3q_lane_s8(i8* %a, [3 x <16 x i8>] %b.coerce) {
     ; Store-lane codegen test: unpacks the three <16 x i8> vectors from %b.coerce
     ; and passes them with %a to @llvm.arm.neon.vst3lane.v16i8 (trailing operands
     ; i32 15, i32 1 -- presumably lane index and alignment). The expected output
     ; is a three-register single-lane st3 on .b elements with the address in x0.
1850 ; CHECK-LABEL: test_vst3q_lane_s8
1851 ; CHECK: st3 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
1852 entry:
1853   %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0
1854   %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
1855   %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
1856   tail call void @llvm.arm.neon.vst3lane.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 15, i32 1)
1857   ret void
1858 }
1859
1860 define void @test_vst3q_lane_s16(i16* %a, [3 x <8 x i16>] %b.coerce) {
     ; Store-lane codegen test: unpacks the three <8 x i16> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v8i16
     ; (trailing operands i32 7, i32 2 -- presumably lane index and alignment). The
     ; expected output is a three-register single-lane st3 on .h elements via x0.
1861 ; CHECK-LABEL: test_vst3q_lane_s16
1862 ; CHECK: st3 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
1863 entry:
1864   %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0
1865   %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
1866   %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
1867   %0 = bitcast i16* %a to i8*
1868   tail call void @llvm.arm.neon.vst3lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 7, i32 2)
1869   ret void
1870 }
1871
1872 define void @test_vst3q_lane_s32(i32* %a, [3 x <4 x i32>] %b.coerce) {
     ; Store-lane codegen test: unpacks the three <4 x i32> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v4i32
     ; (trailing operands i32 3, i32 4 -- presumably lane index and alignment). The
     ; expected output is a three-register single-lane st3 on .s elements via x0.
1873 ; CHECK-LABEL: test_vst3q_lane_s32
1874 ; CHECK: st3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1875 entry:
1876   %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0
1877   %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
1878   %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
1879   %0 = bitcast i32* %a to i8*
1880   tail call void @llvm.arm.neon.vst3lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 3, i32 4)
1881   ret void
1882 }
1883
1884 define void @test_vst3q_lane_s64(i64* %a, [3 x <2 x i64>] %b.coerce) {
     ; Store-lane codegen test: unpacks the three <2 x i64> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v2i64
     ; (trailing operands i32 1, i32 8 -- presumably lane index and alignment). The
     ; expected output is a three-register single-lane st3 on .d elements via x0.
1885 ; CHECK-LABEL: test_vst3q_lane_s64
1886 ; CHECK: st3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1887 entry:
1888   %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0
1889   %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
1890   %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
1891   %0 = bitcast i64* %a to i8*
1892   tail call void @llvm.arm.neon.vst3lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 1, i32 8)
1893   ret void
1894 }
1895
1896 define void @test_vst3q_lane_f32(float* %a, [3 x <4 x float>] %b.coerce) {
     ; Store-lane codegen test: unpacks the three <4 x float> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v4f32
     ; (trailing operands i32 3, i32 4 -- presumably lane index and alignment). The
     ; expected output is a three-register single-lane st3 on .s elements via x0.
1897 ; CHECK-LABEL: test_vst3q_lane_f32
1898 ; CHECK: st3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1899 entry:
1900   %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0
1901   %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
1902   %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
1903   %0 = bitcast float* %a to i8*
1904   tail call void @llvm.arm.neon.vst3lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 3, i32 4)
1905   ret void
1906 }
1907
1908 define void @test_vst3q_lane_f64(double* %a, [3 x <2 x double>] %b.coerce) {
     ; Store-lane codegen test: unpacks the three <2 x double> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v2f64
     ; (trailing operands i32 1, i32 8 -- presumably lane index and alignment). The
     ; expected output is a three-register single-lane st3 on .d elements via x0.
1909 ; CHECK-LABEL: test_vst3q_lane_f64
1910 ; CHECK: st3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1911 entry:
1912   %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0
1913   %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
1914   %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
1915   %0 = bitcast double* %a to i8*
1916   tail call void @llvm.arm.neon.vst3lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 1, i32 8)
1917   ret void
1918 }
1919
1920 define void @test_vst3_lane_s8(i8* %a, [3 x <8 x i8>] %b.coerce) {
     ; Store-lane codegen test: unpacks the three <8 x i8> vectors from %b.coerce
     ; and passes them with %a to @llvm.arm.neon.vst3lane.v8i8 (trailing operands
     ; i32 7, i32 1 -- presumably lane index and alignment). The expected output
     ; is a three-register single-lane st3 on .b elements with the address in x0.
1921 ; CHECK-LABEL: test_vst3_lane_s8
1922 ; CHECK: st3 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
1923 entry:
1924   %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0
1925   %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
1926   %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
1927   tail call void @llvm.arm.neon.vst3lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 7, i32 1)
1928   ret void
1929 }
1930
1931 define void @test_vst3_lane_s16(i16* %a, [3 x <4 x i16>] %b.coerce) {
     ; Store-lane codegen test: unpacks the three <4 x i16> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v4i16
     ; (trailing operands i32 3, i32 2 -- presumably lane index and alignment). The
     ; expected output is a three-register single-lane st3 on .h elements via x0.
1932 ; CHECK-LABEL: test_vst3_lane_s16
1933 ; CHECK: st3 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
1934 entry:
1935   %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0
1936   %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
1937   %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
1938   %0 = bitcast i16* %a to i8*
1939   tail call void @llvm.arm.neon.vst3lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 3, i32 2)
1940   ret void
1941 }
1942
1943 define void @test_vst3_lane_s32(i32* %a, [3 x <2 x i32>] %b.coerce) {
     ; Store-lane codegen test: unpacks the three <2 x i32> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v2i32
     ; (trailing operands i32 1, i32 4 -- presumably lane index and alignment). The
     ; expected output is a three-register single-lane st3 on .s elements via x0.
1944 ; CHECK-LABEL: test_vst3_lane_s32
1945 ; CHECK: st3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1946 entry:
1947   %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0
1948   %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
1949   %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
1950   %0 = bitcast i32* %a to i8*
1951   tail call void @llvm.arm.neon.vst3lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 1, i32 4)
1952   ret void
1953 }
1954
1955 define void @test_vst3_lane_s64(i64* %a, [3 x <1 x i64>] %b.coerce) {
     ; Store-lane codegen test: unpacks the three <1 x i64> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v1i64
     ; (trailing operands i32 0, i32 8 -- presumably lane index and alignment). The
     ; expected output is a three-register single-lane st3 on .d elements via x0.
1956 ; CHECK-LABEL: test_vst3_lane_s64
1957 ; CHECK: st3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1958 entry:
1959   %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0
1960   %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
1961   %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
1962   %0 = bitcast i64* %a to i8*
1963   tail call void @llvm.arm.neon.vst3lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 0, i32 8)
1964   ret void
1965 }
1966
1967 define void @test_vst3_lane_f32(float* %a, [3 x <2 x float>] %b.coerce) {
     ; Store-lane codegen test: unpacks the three <2 x float> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v2f32
     ; (trailing operands i32 1, i32 4 -- presumably lane index and alignment). The
     ; expected output is a three-register single-lane st3 on .s elements via x0.
1968 ; CHECK-LABEL: test_vst3_lane_f32
1969 ; CHECK: st3 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
1970 entry:
1971   %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0
1972   %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
1973   %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
1974   %0 = bitcast float* %a to i8*
1975   tail call void @llvm.arm.neon.vst3lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 1, i32 4)
1976   ret void
1977 }
1978
1979 define void @test_vst3_lane_f64(double* %a, [3 x <1 x double>] %b.coerce) {
     ; Store-lane codegen test: unpacks the three <1 x double> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst3lane.v1f64
     ; (trailing operands i32 0, i32 8 -- presumably lane index and alignment). The
     ; expected output is a three-register single-lane st3 on .d elements via x0.
1980 ; CHECK-LABEL: test_vst3_lane_f64
1981 ; CHECK: st3 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
1982 entry:
1983   %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0
1984   %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
1985   %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
1986   %0 = bitcast double* %a to i8*
1987   tail call void @llvm.arm.neon.vst3lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 0, i32 8)
1988   ret void
1989 }
1990
1991 define void @test_vst4q_lane_s8(i16* %a, [4 x <16 x i8>] %b.coerce) {
     ; Store-lane codegen test: unpacks the four <16 x i8> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v16i8
     ; (trailing operands i32 15, i32 2 -- presumably lane index and alignment). The
     ; expected output is a four-register single-lane st4 on .b elements via x0.
     ; NOTE(review): unlike test_vst2q_lane_s8 and test_vst3q_lane_s8, %a is declared
     ; i16* rather than i8* and the last operand is i32 2 rather than i32 1 --
     ; confirm whether this is intentional.
1992 ; CHECK-LABEL: test_vst4q_lane_s8
1993 ; CHECK: st4 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
1994 entry:
1995   %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0
1996   %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
1997   %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
1998   %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
1999   %0 = bitcast i16* %a to i8*
2000   tail call void @llvm.arm.neon.vst4lane.v16i8(i8* %0, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 15, i32 2)
2001   ret void
2002 }
2003
2004 define void @test_vst4q_lane_s16(i16* %a, [4 x <8 x i16>] %b.coerce) {
     ; Store-lane codegen test: unpacks the four <8 x i16> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v8i16
     ; (trailing operands i32 7, i32 2 -- presumably lane index and alignment). The
     ; expected output is a four-register single-lane st4 on .h elements via x0.
2005 ; CHECK-LABEL: test_vst4q_lane_s16
2006 ; CHECK: st4 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
2007 entry:
2008   %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0
2009   %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
2010   %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
2011   %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
2012   %0 = bitcast i16* %a to i8*
2013   tail call void @llvm.arm.neon.vst4lane.v8i16(i8* %0, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 7, i32 2)
2014   ret void
2015 }
2016
2017 define void @test_vst4q_lane_s32(i32* %a, [4 x <4 x i32>] %b.coerce) {
     ; Store-lane codegen test: unpacks the four <4 x i32> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v4i32
     ; (trailing operands i32 3, i32 4 -- presumably lane index and alignment). The
     ; expected output is a four-register single-lane st4 on .s elements via x0.
2018 ; CHECK-LABEL: test_vst4q_lane_s32
2019 ; CHECK: st4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
2020 entry:
2021   %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0
2022   %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
2023   %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
2024   %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
2025   %0 = bitcast i32* %a to i8*
2026   tail call void @llvm.arm.neon.vst4lane.v4i32(i8* %0, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 3, i32 4)
2027   ret void
2028 }
2029
2030 define void @test_vst4q_lane_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
     ; Store-lane codegen test: unpacks the four <2 x i64> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v2i64
     ; (trailing operands i32 1, i32 8 -- presumably lane index and alignment). The
     ; expected output is a four-register single-lane st4 on .d elements via x0.
2031 ; CHECK-LABEL: test_vst4q_lane_s64
2032 ; CHECK: st4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
2033 entry:
2034   %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
2035   %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
2036   %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
2037   %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
2038   %0 = bitcast i64* %a to i8*
2039   tail call void @llvm.arm.neon.vst4lane.v2i64(i8* %0, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 1, i32 8)
2040   ret void
2041 }
2042
2043 define void @test_vst4q_lane_f32(float* %a, [4 x <4 x float>] %b.coerce) {
     ; Store-lane codegen test: unpacks the four <4 x float> vectors from %b.coerce
     ; and passes them with %a (bitcast to i8*) to @llvm.arm.neon.vst4lane.v4f32
     ; (trailing operands i32 3, i32 4 -- presumably lane index and alignment). The
     ; expected output is a four-register single-lane st4 on .s elements via x0.
2044 ; CHECK-LABEL: test_vst4q_lane_f32
2045 ; CHECK: st4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
2046 entry:
2047   %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
2048   %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
2049   %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
2050   %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
2051   %0 = bitcast float* %a to i8*
2052   tail call void @llvm.arm.neon.vst4lane.v4f32(i8* %0, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 3, i32 4)
2053   ret void
2054 }
2055
; Lane store of four <2 x double> vectors: unpacks %b.coerce, casts %a to i8*
; and calls @llvm.arm.neon.vst4lane.v2f64 with trailing operands i32 1, i32 8
; (presumably lane index and alignment -- confirm against the intrinsic docs).
; NOTE(review): the pattern below accepts any decimal lane index; it only
; requires an st4 on four .d registers addressed through x0.
2056 define void @test_vst4q_lane_f64(double* %a, [4 x <2 x double>] %b.coerce) {
2057 ; CHECK-LABEL: test_vst4q_lane_f64
2058 ; CHECK: st4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
2059 entry:
2060   %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
2061   %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
2062   %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
2063   %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
2064   %0 = bitcast double* %a to i8*
2065   tail call void @llvm.arm.neon.vst4lane.v2f64(i8* %0, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 1, i32 8)
2066   ret void
2067 }
2068
; Lane store of four 64-bit <8 x i8> vectors: unpacks %b.coerce and calls
; @llvm.arm.neon.vst4lane.v8i8 with trailing operands i32 7, i32 1 (presumably
; lane index and alignment -- confirm against the intrinsic docs). %a is
; already an i8*, so no pointer cast is needed here.
; NOTE(review): the pattern below accepts any decimal lane index; it only
; requires an st4 on four .b registers addressed through x0.
2069 define void @test_vst4_lane_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
2070 ; CHECK-LABEL: test_vst4_lane_s8
2071 ; CHECK: st4 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[{{[0-9]+}}], [x0]
2072 entry:
2073   %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
2074   %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
2075   %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
2076   %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
2077   tail call void @llvm.arm.neon.vst4lane.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 7, i32 1)
2078   ret void
2079 }
2080
; Lane store of four 64-bit <4 x i16> vectors: unpacks %b.coerce, casts %a to
; i8* and calls @llvm.arm.neon.vst4lane.v4i16 with trailing operands i32 3,
; i32 2 (presumably lane index and alignment -- confirm against the intrinsic
; docs).
; NOTE(review): the pattern below accepts any decimal lane index; it only
; requires an st4 on four .h registers addressed through x0.
2081 define void @test_vst4_lane_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
2082 ; CHECK-LABEL: test_vst4_lane_s16
2083 ; CHECK: st4 { {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h, {{v[0-9]+}}.h }[{{[0-9]+}}], [x0]
2084 entry:
2085   %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
2086   %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
2087   %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
2088   %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
2089   %0 = bitcast i16* %a to i8*
2090   tail call void @llvm.arm.neon.vst4lane.v4i16(i8* %0, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 3, i32 2)
2091   ret void
2092 }
2093
; Lane store of four 64-bit <2 x i32> vectors: unpacks %b.coerce, casts %a to
; i8* and calls @llvm.arm.neon.vst4lane.v2i32 with trailing operands i32 1,
; i32 4 (presumably lane index and alignment -- confirm against the intrinsic
; docs).
; NOTE(review): the pattern below accepts any decimal lane index; it only
; requires an st4 on four .s registers addressed through x0.
2094 define void @test_vst4_lane_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
2095 ; CHECK-LABEL: test_vst4_lane_s32
2096 ; CHECK: st4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
2097 entry:
2098   %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
2099   %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
2100   %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
2101   %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
2102   %0 = bitcast i32* %a to i8*
2103   tail call void @llvm.arm.neon.vst4lane.v2i32(i8* %0, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 1, i32 4)
2104   ret void
2105 }
2106
; Lane store of four single-element <1 x i64> vectors: unpacks %b.coerce,
; casts %a to i8* and calls @llvm.arm.neon.vst4lane.v1i64 with trailing
; operands i32 0, i32 8 (presumably lane index and alignment -- confirm against
; the intrinsic docs).
; NOTE(review): the pattern below accepts any decimal lane index; it only
; requires an st4 on four .d registers addressed through x0.
2107 define void @test_vst4_lane_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
2108 ; CHECK-LABEL: test_vst4_lane_s64
2109 ; CHECK: st4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
2110 entry:
2111   %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
2112   %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
2113   %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
2114   %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
2115   %0 = bitcast i64* %a to i8*
2116   tail call void @llvm.arm.neon.vst4lane.v1i64(i8* %0, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 0, i32 8)
2117   ret void
2118 }
2119
; Lane store of four 64-bit <2 x float> vectors: unpacks %b.coerce, casts %a
; to i8* and calls @llvm.arm.neon.vst4lane.v2f32 with trailing operands i32 1,
; i32 4 (presumably lane index and alignment -- confirm against the intrinsic
; docs).
; NOTE(review): the pattern below accepts any decimal lane index; it only
; requires an st4 on four .s registers addressed through x0.
2120 define void @test_vst4_lane_f32(float* %a, [4 x <2 x float>] %b.coerce) {
2121 ; CHECK-LABEL: test_vst4_lane_f32
2122 ; CHECK: st4 { {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s, {{v[0-9]+}}.s }[{{[0-9]+}}], [x0]
2123 entry:
2124   %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
2125   %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
2126   %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
2127   %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
2128   %0 = bitcast float* %a to i8*
2129   tail call void @llvm.arm.neon.vst4lane.v2f32(i8* %0, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 1, i32 4)
2130   ret void
2131 }
2132
; Lane store of four single-element <1 x double> vectors: unpacks %b.coerce,
; casts %a to i8* and calls @llvm.arm.neon.vst4lane.v1f64 with trailing
; operands i32 0, i32 8 (presumably lane index and alignment -- confirm against
; the intrinsic docs).
; NOTE(review): the pattern below accepts any decimal lane index; it only
; requires an st4 on four .d registers addressed through x0.
2133 define void @test_vst4_lane_f64(double* %a, [4 x <1 x double>] %b.coerce) {
2134 ; CHECK-LABEL: test_vst4_lane_f64
2135 ; CHECK: st4 { {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d, {{v[0-9]+}}.d }[{{[0-9]+}}], [x0]
2136 entry:
2137   %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
2138   %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
2139   %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
2140   %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
2141   %0 = bitcast double* %a to i8*
2142   tail call void @llvm.arm.neon.vst4lane.v1f64(i8* %0, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 0, i32 8)
2143   ret void
2144 }
2145
2146 declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2147 declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2148 declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2149 declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2150 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2151 declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2152 declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2153 declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2154 declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2155 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
2156 declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2157 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
2158 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2159 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2160 declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2161 declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2162 declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2163 declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2164 declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2165 declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2166 declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2167 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
2168 declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2169 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
2170 declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2171 declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2172 declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2173 declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2174 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2175 declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2176 declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2177 declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2178 declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2179 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
2180 declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2181 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
2182 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2183 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
2184 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2185 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2186 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2187 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2188 declare void @llvm.arm.neon.vst2lane.v16i8(i8*, <16 x i8>, <16 x i8>, i32, i32)
2189 declare void @llvm.arm.neon.vst2lane.v8i16(i8*, <8 x i16>, <8 x i16>, i32, i32)
2190 declare void @llvm.arm.neon.vst2lane.v4i32(i8*, <4 x i32>, <4 x i32>, i32, i32)
2191 declare void @llvm.arm.neon.vst2lane.v2i64(i8*, <2 x i64>, <2 x i64>, i32, i32)
2192 declare void @llvm.arm.neon.vst2lane.v4f32(i8*, <4 x float>, <4 x float>, i32, i32)
2193 declare void @llvm.arm.neon.vst2lane.v2f64(i8*, <2 x double>, <2 x double>, i32, i32)
2194 declare void @llvm.arm.neon.vst2lane.v8i8(i8*, <8 x i8>, <8 x i8>, i32, i32)
2195 declare void @llvm.arm.neon.vst2lane.v4i16(i8*, <4 x i16>, <4 x i16>, i32, i32)
2196 declare void @llvm.arm.neon.vst2lane.v2i32(i8*, <2 x i32>, <2 x i32>, i32, i32)
2197 declare void @llvm.arm.neon.vst2lane.v1i64(i8*, <1 x i64>, <1 x i64>, i32, i32)
2198 declare void @llvm.arm.neon.vst2lane.v2f32(i8*, <2 x float>, <2 x float>, i32, i32)
2199 declare void @llvm.arm.neon.vst2lane.v1f64(i8*, <1 x double>, <1 x double>, i32, i32)
2200 declare void @llvm.arm.neon.vst3lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2201 declare void @llvm.arm.neon.vst3lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2202 declare void @llvm.arm.neon.vst3lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2203 declare void @llvm.arm.neon.vst3lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2204 declare void @llvm.arm.neon.vst3lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2205 declare void @llvm.arm.neon.vst3lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2206 declare void @llvm.arm.neon.vst3lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2207 declare void @llvm.arm.neon.vst3lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2208 declare void @llvm.arm.neon.vst3lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2209 declare void @llvm.arm.neon.vst3lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2210 declare void @llvm.arm.neon.vst3lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2211 declare void @llvm.arm.neon.vst3lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2212 declare void @llvm.arm.neon.vst4lane.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32, i32)
2213 declare void @llvm.arm.neon.vst4lane.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32, i32)
2214 declare void @llvm.arm.neon.vst4lane.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32, i32)
2215 declare void @llvm.arm.neon.vst4lane.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32, i32)
2216 declare void @llvm.arm.neon.vst4lane.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32, i32)
2217 declare void @llvm.arm.neon.vst4lane.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32, i32)
2218 declare void @llvm.arm.neon.vst4lane.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32, i32)
2219 declare void @llvm.arm.neon.vst4lane.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32, i32)
2220 declare void @llvm.arm.neon.vst4lane.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32, i32)
2221 declare void @llvm.arm.neon.vst4lane.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32, i32)
2222 declare void @llvm.arm.neon.vst4lane.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32, i32)
2223 declare void @llvm.arm.neon.vst4lane.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32, i32)
2224
; Lane load into two <16 x i8> vectors: unpacks %src.coerce, calls
; @llvm.arm.neon.vld2lane.v16i8 on %ptr with trailing operands i32 15, i32 1,
; then repacks both result vectors into %struct.int8x16x2_t for the return.
; The pattern below pins lane index 15 (the i32 15 passed in the call) and
; requires an ld2 on two .b registers addressed through x0.
2225 define %struct.int8x16x2_t @test_vld2q_lane_s8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
2226 ; CHECK-LABEL: test_vld2q_lane_s8
2227 ; CHECK: ld2 { {{v[0-9]+}}.b, {{v[0-9]+}}.b }[15], [x0]
2228 entry:
2229   %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
2230   %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
2231   %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
2232   %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
2233   %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
2234   %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
2235   %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
2236   ret %struct.int8x16x2_t %.fca.0.1.insert
2237 }
2238
; Lane load into two <16 x i8> vectors, returned as %struct.uint8x16x2_t:
; unpacks %src.coerce, calls @llvm.arm.neon.vld2lane.v16i8 on %ptr with
; trailing operands i32 15, i32 1, then repacks both result vectors.
; The pattern below pins lane index 15 (the i32 15 passed in the call) and
; requires an ld2 on two .b registers addressed through x0.
2239 define %struct.uint8x16x2_t @test_vld2q_lane_u8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
2240 ; CHECK-LABEL: test_vld2q_lane_u8
2241 ; CHECK: ld2 { {{v[0-9]+}}.b, {{v[0-9]+}}.b }[15], [x0]
2242 entry:
2243   %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
2244   %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
2245   %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
2246   %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
2247   %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
2248   %.fca.0.0.insert = insertvalue %struct.uint8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
2249   %.fca.0.1.insert = insertvalue %struct.uint8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
2250   ret %struct.uint8x16x2_t %.fca.0.1.insert
2251 }
2252
; Lane load into two <16 x i8> vectors, returned as %struct.poly8x16x2_t:
; unpacks %src.coerce, calls @llvm.arm.neon.vld2lane.v16i8 on %ptr with
; trailing operands i32 15, i32 1, then repacks both result vectors.
; The pattern below pins lane index 15 (the i32 15 passed in the call) and
; requires an ld2 on two .b registers addressed through x0.
2253 define %struct.poly8x16x2_t @test_vld2q_lane_p8(i8* readonly %ptr, [2 x <16 x i8>] %src.coerce) {
2254 ; CHECK-LABEL: test_vld2q_lane_p8
2255 ; CHECK: ld2 { {{v[0-9]+}}.b, {{v[0-9]+}}.b }[15], [x0]
2256 entry:
2257   %src.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %src.coerce, 0
2258   %src.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %src.coerce, 1
2259   %vld2_lane = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, i32 15, i32 1)
2260   %vld2_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 0
2261   %vld2_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2_lane, 1
2262   %.fca.0.0.insert = insertvalue %struct.poly8x16x2_t undef, <16 x i8> %vld2_lane.fca.0.extract, 0, 0
2263   %.fca.0.1.insert = insertvalue %struct.poly8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2_lane.fca.1.extract, 0, 1
2264   ret %struct.poly8x16x2_t %.fca.0.1.insert
2265 }
2266
; Lane load into three <16 x i8> vectors: unpacks %src.coerce, calls
; @llvm.arm.neon.vld3lane.v16i8 on %ptr with trailing operands i32 15, i32 1,
; then repacks the three result vectors into %struct.int8x16x3_t.
; The pattern below pins lane index 15 (the i32 15 passed in the call) and
; requires an ld3 on three .b registers addressed through x0.
2267 define %struct.int8x16x3_t @test_vld3q_lane_s8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
2268 ; CHECK-LABEL: test_vld3q_lane_s8
2269 ; CHECK: ld3 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[15], [x0]
2270 entry:
2271   %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
2272   %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
2273   %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
2274   %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
2275   %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
2276   %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
2277   %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
2278   %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
2279   %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
2280   %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
2281   ret %struct.int8x16x3_t %.fca.0.2.insert
2282 }
2283
; Lane load into three <16 x i8> vectors, returned as %struct.uint8x16x3_t:
; unpacks %src.coerce, calls @llvm.arm.neon.vld3lane.v16i8 on %ptr with
; trailing operands i32 15, i32 1, then repacks the three result vectors.
; The pattern below pins lane index 15 (the i32 15 passed in the call) and
; requires an ld3 on three .b registers addressed through x0.
2284 define %struct.uint8x16x3_t @test_vld3q_lane_u8(i8* readonly %ptr, [3 x <16 x i8>] %src.coerce) {
2285 ; CHECK-LABEL: test_vld3q_lane_u8
2286 ; CHECK: ld3 { {{v[0-9]+}}.b, {{v[0-9]+}}.b, {{v[0-9]+}}.b }[15], [x0]
2287 entry:
2288   %src.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %src.coerce, 0
2289   %src.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %src.coerce, 1
2290   %src.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %src.coerce, 2
2291   %vld3_lane = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3lane.v16i8(i8* %ptr, <16 x i8> %src.coerce.fca.0.extract, <16 x i8> %src.coerce.fca.1.extract, <16 x i8> %src.coerce.fca.2.extract, i32 15, i32 1)
2292   %vld3_lane.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 0
2293   %vld3_lane.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 1
2294   %vld3_lane.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3_lane, 2
2295   %.fca.0.0.insert = insertvalue %struct.uint8x16x3_t undef, <16 x i8> %vld3_lane.fca.0.extract, 0, 0
2296   %.fca.0.1.insert = insertvalue %struct.uint8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3_lane.fca.1.extract, 0, 1
2297   %.fca.0.2.insert = insertvalue %struct.uint8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3_lane.fca.2.extract, 0, 2
2298   ret %struct.uint8x16x3_t %.fca.0.2.insert
2299 }
2300