test/CodeGen/AArch64/neon-simd-ldst-multi-elem.ll

   1 ; RUN: llc < %s -verify-machineinstrs -mtriple=aarch64-none-linux-gnu -mattr=+neon | FileCheck %s
   2
   3 %struct.int8x16x2_t = type { [2 x <16 x i8>] } ; x2 aggregates of 128-bit vectors (array of two vectors in a struct)
   4 %struct.int16x8x2_t = type { [2 x <8 x i16>] }
   5 %struct.int32x4x2_t = type { [2 x <4 x i32>] }
   6 %struct.int64x2x2_t = type { [2 x <2 x i64>] }
   7 %struct.float32x4x2_t = type { [2 x <4 x float>] }
   8 %struct.float64x2x2_t = type { [2 x <2 x double>] }
   9 %struct.int8x8x2_t = type { [2 x <8 x i8>] } ; x2 aggregates of 64-bit vectors
  10 %struct.int16x4x2_t = type { [2 x <4 x i16>] }
  11 %struct.int32x2x2_t = type { [2 x <2 x i32>] }
  12 %struct.int64x1x2_t = type { [2 x <1 x i64>] }
  13 %struct.float32x2x2_t = type { [2 x <2 x float>] }
  14 %struct.float64x1x2_t = type { [2 x <1 x double>] }
  15 %struct.int8x16x3_t = type { [3 x <16 x i8>] } ; x3 aggregates of 128-bit vectors
  16 %struct.int16x8x3_t = type { [3 x <8 x i16>] }
  17 %struct.int32x4x3_t = type { [3 x <4 x i32>] }
  18 %struct.int64x2x3_t = type { [3 x <2 x i64>] }
  19 %struct.float32x4x3_t = type { [3 x <4 x float>] }
  20 %struct.float64x2x3_t = type { [3 x <2 x double>] }
  21 %struct.int8x8x3_t = type { [3 x <8 x i8>] } ; x3 aggregates of 64-bit vectors
  22 %struct.int16x4x3_t = type { [3 x <4 x i16>] }
  23 %struct.int32x2x3_t = type { [3 x <2 x i32>] }
  24 %struct.int64x1x3_t = type { [3 x <1 x i64>] }
  25 %struct.float32x2x3_t = type { [3 x <2 x float>] }
  26 %struct.float64x1x3_t = type { [3 x <1 x double>] }
  27 %struct.int8x16x4_t = type { [4 x <16 x i8>] } ; x4 aggregates of 128-bit vectors
  28 %struct.int16x8x4_t = type { [4 x <8 x i16>] }
  29 %struct.int32x4x4_t = type { [4 x <4 x i32>] }
  30 %struct.int64x2x4_t = type { [4 x <2 x i64>] }
  31 %struct.float32x4x4_t = type { [4 x <4 x float>] }
  32 %struct.float64x2x4_t = type { [4 x <2 x double>] }
  33 %struct.int8x8x4_t = type { [4 x <8 x i8>] } ; x4 aggregates of 64-bit vectors
  34 %struct.int16x4x4_t = type { [4 x <4 x i16>] }
  35 %struct.int32x2x4_t = type { [4 x <2 x i32>] }
  36 %struct.int64x1x4_t = type { [4 x <1 x i64>] }
  37 %struct.float32x2x4_t = type { [4 x <2 x float>] }
  38 %struct.float64x1x4_t = type { [4 x <1 x double>] }
  39
  40
  41 define <16 x i8> @test_vld1q_s8(i8* readonly %a) { ; <16 x i8> vld1 must select a one-register ld1 (.16b); base is any xN or sp
  42 ; CHECK: test_vld1q_s8
  43 ; CHECK: ld1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
  44   %vld1 = tail call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %a, i32 1)
  45   ret <16 x i8> %vld1
  46 }
  47
  48 define <8 x i16> @test_vld1q_s16(i16* readonly %a) { ; <8 x i16> vld1 must select a one-register ld1 (.8h); base is any xN or sp
  49 ; CHECK: test_vld1q_s16
  50 ; CHECK: ld1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
  51   %1 = bitcast i16* %a to i8*
  52   %vld1 = tail call <8 x i16> @llvm.arm.neon.vld1.v8i16(i8* %1, i32 2)
  53   ret <8 x i16> %vld1
  54 }
  55
  56 define <4 x i32> @test_vld1q_s32(i32* readonly %a) { ; <4 x i32> vld1 must select a one-register ld1 (.4s); base is any xN or sp
  57 ; CHECK: test_vld1q_s32
  58 ; CHECK: ld1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
  59   %1 = bitcast i32* %a to i8*
  60   %vld1 = tail call <4 x i32> @llvm.arm.neon.vld1.v4i32(i8* %1, i32 4)
  61   ret <4 x i32> %vld1
  62 }
  63
  64 define <2 x i64> @test_vld1q_s64(i64* readonly %a) { ; <2 x i64> vld1 must select a one-register ld1 (.2d); base is any xN or sp
  65 ; CHECK: test_vld1q_s64
  66 ; CHECK: ld1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
  67   %1 = bitcast i64* %a to i8*
  68   %vld1 = tail call <2 x i64> @llvm.arm.neon.vld1.v2i64(i8* %1, i32 8)
  69   ret <2 x i64> %vld1
  70 }
  71
  72 define <4 x float> @test_vld1q_f32(float* readonly %a) { ; <4 x float> vld1 must select a one-register ld1 (.4s); base is any xN or sp
  73 ; CHECK: test_vld1q_f32
  74 ; CHECK: ld1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
  75   %1 = bitcast float* %a to i8*
  76   %vld1 = tail call <4 x float> @llvm.arm.neon.vld1.v4f32(i8* %1, i32 4)
  77   ret <4 x float> %vld1
  78 }
  79
  80 define <2 x double> @test_vld1q_f64(double* readonly %a) { ; <2 x double> vld1 must select a one-register ld1 (.2d); base is any xN or sp
  81 ; CHECK: test_vld1q_f64
  82 ; CHECK: ld1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
  83   %addr = bitcast double* %a to i8*
  84   %vec = tail call <2 x double> @llvm.arm.neon.vld1.v2f64(i8* %addr, i32 8)
  85   ret <2 x double> %vec
  86 }
  87
  88 define <8 x i8> @test_vld1_s8(i8* readonly %a) { ; <8 x i8> vld1 must select a one-register ld1 (.8b); base is any xN or sp
  89 ; CHECK: test_vld1_s8
  90 ; CHECK: ld1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
  91   %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
  92   ret <8 x i8> %vld1
  93 }
  94
  95 define <4 x i16> @test_vld1_s16(i16* readonly %a) { ; <4 x i16> vld1 must select a one-register ld1 (.4h); base is any xN or sp
  96 ; CHECK: test_vld1_s16
  97 ; CHECK: ld1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
  98   %1 = bitcast i16* %a to i8*
  99   %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
 100   ret <4 x i16> %vld1
 101 }
 102
 103 define <2 x i32> @test_vld1_s32(i32* readonly %a) { ; <2 x i32> vld1 must select a one-register ld1 (.2s); base is any xN or sp
 104 ; CHECK: test_vld1_s32
 105 ; CHECK: ld1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 106   %1 = bitcast i32* %a to i8*
 107   %vld1 = tail call <2 x i32> @llvm.arm.neon.vld1.v2i32(i8* %1, i32 4)
 108   ret <2 x i32> %vld1
 109 }
 110
 111 define <1 x i64> @test_vld1_s64(i64* readonly %a) { ; <1 x i64> vld1 must select a one-register ld1 (.1d); base is any xN or sp
 112 ; CHECK: test_vld1_s64
 113 ; CHECK: ld1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 114   %1 = bitcast i64* %a to i8*
 115   %vld1 = tail call <1 x i64> @llvm.arm.neon.vld1.v1i64(i8* %1, i32 8)
 116   ret <1 x i64> %vld1
 117 }
 118
 119 define <2 x float> @test_vld1_f32(float* readonly %a) { ; <2 x float> vld1 must select a one-register ld1 (.2s); base is any xN or sp
 120 ; CHECK: test_vld1_f32
 121 ; CHECK: ld1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 122   %1 = bitcast float* %a to i8*
 123   %vld1 = tail call <2 x float> @llvm.arm.neon.vld1.v2f32(i8* %1, i32 4)
 124   ret <2 x float> %vld1
 125 }
 126
 127 define <1 x double> @test_vld1_f64(double* readonly %a) { ; <1 x double> vld1 must select a one-register ld1 (.1d); base is any xN or sp
 128 ; CHECK: test_vld1_f64
 129 ; CHECK: ld1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 130   %1 = bitcast double* %a to i8*
 131   %vld1 = tail call <1 x double> @llvm.arm.neon.vld1.v1f64(i8* %1, i32 8)
 132   ret <1 x double> %vld1
 133 }
 134
 135 define <8 x i8> @test_vld1_p8(i8* readonly %a) { ; p8 variant: same v8i8 vld1 intrinsic as test_vld1_s8; expects ld1 (.8b), base xN or sp
 136 ; CHECK: test_vld1_p8
 137 ; CHECK: ld1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 138   %vld1 = tail call <8 x i8> @llvm.arm.neon.vld1.v8i8(i8* %a, i32 1)
 139   ret <8 x i8> %vld1
 140 }
 141
 142 define <4 x i16> @test_vld1_p16(i16* readonly %a) { ; p16 variant: same v4i16 vld1 intrinsic as test_vld1_s16; expects ld1 (.4h), base xN or sp
 143 ; CHECK: test_vld1_p16
 144 ; CHECK: ld1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 145   %1 = bitcast i16* %a to i8*
 146   %vld1 = tail call <4 x i16> @llvm.arm.neon.vld1.v4i16(i8* %1, i32 2)
 147   ret <4 x i16> %vld1
 148 }
 149
 150 define %struct.int8x16x2_t @test_vld2q_s8(i8* readonly %a) { ; <16 x i8> vld2 repacked into the x2 struct; expects two-register ld2 (.16b), base xN or sp
 151 ; CHECK: test_vld2q_s8
 152 ; CHECK: ld2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
 153   %vld2 = tail call { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8* %a, i32 1)
 154   %vld2.fca.0.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 0
 155   %vld2.fca.1.extract = extractvalue { <16 x i8>, <16 x i8> } %vld2, 1
 156   %.fca.0.0.insert = insertvalue %struct.int8x16x2_t undef, <16 x i8> %vld2.fca.0.extract, 0, 0
 157   %.fca.0.1.insert = insertvalue %struct.int8x16x2_t %.fca.0.0.insert, <16 x i8> %vld2.fca.1.extract, 0, 1
 158   ret %struct.int8x16x2_t %.fca.0.1.insert
 159 }
 160
 161 define %struct.int16x8x2_t @test_vld2q_s16(i16* readonly %a) { ; <8 x i16> vld2 repacked into the x2 struct; expects two-register ld2 (.8h), base xN or sp
 162 ; CHECK: test_vld2q_s16
 163 ; CHECK: ld2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
 164   %1 = bitcast i16* %a to i8*
 165   %vld2 = tail call { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8* %1, i32 2)
 166   %vld2.fca.0.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 0
 167   %vld2.fca.1.extract = extractvalue { <8 x i16>, <8 x i16> } %vld2, 1
 168   %.fca.0.0.insert = insertvalue %struct.int16x8x2_t undef, <8 x i16> %vld2.fca.0.extract, 0, 0
 169   %.fca.0.1.insert = insertvalue %struct.int16x8x2_t %.fca.0.0.insert, <8 x i16> %vld2.fca.1.extract, 0, 1
 170   ret %struct.int16x8x2_t %.fca.0.1.insert
 171 }
 172
 173 define %struct.int32x4x2_t @test_vld2q_s32(i32* readonly %a) { ; <4 x i32> vld2 repacked into the x2 struct; expects two-register ld2 (.4s), base xN or sp
 174 ; CHECK: test_vld2q_s32
 175 ; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 176   %1 = bitcast i32* %a to i8*
 177   %vld2 = tail call { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8* %1, i32 4)
 178   %vld2.fca.0.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 0
 179   %vld2.fca.1.extract = extractvalue { <4 x i32>, <4 x i32> } %vld2, 1
 180   %.fca.0.0.insert = insertvalue %struct.int32x4x2_t undef, <4 x i32> %vld2.fca.0.extract, 0, 0
 181   %.fca.0.1.insert = insertvalue %struct.int32x4x2_t %.fca.0.0.insert, <4 x i32> %vld2.fca.1.extract, 0, 1
 182   ret %struct.int32x4x2_t %.fca.0.1.insert
 183 }
 184
 185 define %struct.int64x2x2_t @test_vld2q_s64(i64* readonly %a) { ; <2 x i64> vld2 repacked into the x2 struct; expects two-register ld2 (.2d), base xN or sp
 186 ; CHECK: test_vld2q_s64
 187 ; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 188   %1 = bitcast i64* %a to i8*
 189   %vld2 = tail call { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8* %1, i32 8)
 190   %vld2.fca.0.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 0
 191   %vld2.fca.1.extract = extractvalue { <2 x i64>, <2 x i64> } %vld2, 1
 192   %.fca.0.0.insert = insertvalue %struct.int64x2x2_t undef, <2 x i64> %vld2.fca.0.extract, 0, 0
 193   %.fca.0.1.insert = insertvalue %struct.int64x2x2_t %.fca.0.0.insert, <2 x i64> %vld2.fca.1.extract, 0, 1
 194   ret %struct.int64x2x2_t %.fca.0.1.insert
 195 }
 196
 197 define %struct.float32x4x2_t @test_vld2q_f32(float* readonly %a) { ; <4 x float> vld2 repacked into the x2 struct; expects two-register ld2 (.4s), base xN or sp
 198 ; CHECK: test_vld2q_f32
 199 ; CHECK: ld2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 200   %1 = bitcast float* %a to i8*
 201   %vld2 = tail call { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8* %1, i32 4)
 202   %vld2.fca.0.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 0
 203   %vld2.fca.1.extract = extractvalue { <4 x float>, <4 x float> } %vld2, 1
 204   %.fca.0.0.insert = insertvalue %struct.float32x4x2_t undef, <4 x float> %vld2.fca.0.extract, 0, 0
 205   %.fca.0.1.insert = insertvalue %struct.float32x4x2_t %.fca.0.0.insert, <4 x float> %vld2.fca.1.extract, 0, 1
 206   ret %struct.float32x4x2_t %.fca.0.1.insert
 207 }
 208
 209 define %struct.float64x2x2_t @test_vld2q_f64(double* readonly %a) { ; <2 x double> vld2 repacked into the x2 struct; expects two-register ld2 (.2d), base xN or sp
 210 ; CHECK: test_vld2q_f64
 211 ; CHECK: ld2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 212   %1 = bitcast double* %a to i8*
 213   %vld2 = tail call { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8* %1, i32 8)
 214   %vld2.fca.0.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 0
 215   %vld2.fca.1.extract = extractvalue { <2 x double>, <2 x double> } %vld2, 1
 216   %.fca.0.0.insert = insertvalue %struct.float64x2x2_t undef, <2 x double> %vld2.fca.0.extract, 0, 0
 217   %.fca.0.1.insert = insertvalue %struct.float64x2x2_t %.fca.0.0.insert, <2 x double> %vld2.fca.1.extract, 0, 1
 218   ret %struct.float64x2x2_t %.fca.0.1.insert
 219 }
 220
 221 define %struct.int8x8x2_t @test_vld2_s8(i8* readonly %a) { ; <8 x i8> vld2 repacked into the x2 struct; expects two-register ld2 (.8b), base xN or sp
 222 ; CHECK: test_vld2_s8
 223 ; CHECK: ld2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 224   %vld2 = tail call { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8* %a, i32 1)
 225   %vld2.fca.0.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 0
 226   %vld2.fca.1.extract = extractvalue { <8 x i8>, <8 x i8> } %vld2, 1
 227   %.fca.0.0.insert = insertvalue %struct.int8x8x2_t undef, <8 x i8> %vld2.fca.0.extract, 0, 0
 228   %.fca.0.1.insert = insertvalue %struct.int8x8x2_t %.fca.0.0.insert, <8 x i8> %vld2.fca.1.extract, 0, 1
 229   ret %struct.int8x8x2_t %.fca.0.1.insert
 230 }
 231
 232 define %struct.int16x4x2_t @test_vld2_s16(i16* readonly %a) { ; <4 x i16> vld2 repacked into the x2 struct; expects two-register ld2 (.4h), base xN or sp
 233 ; CHECK: test_vld2_s16
 234 ; CHECK: ld2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 235   %1 = bitcast i16* %a to i8*
 236   %vld2 = tail call { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8* %1, i32 2)
 237   %vld2.fca.0.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 0
 238   %vld2.fca.1.extract = extractvalue { <4 x i16>, <4 x i16> } %vld2, 1
 239   %.fca.0.0.insert = insertvalue %struct.int16x4x2_t undef, <4 x i16> %vld2.fca.0.extract, 0, 0
 240   %.fca.0.1.insert = insertvalue %struct.int16x4x2_t %.fca.0.0.insert, <4 x i16> %vld2.fca.1.extract, 0, 1
 241   ret %struct.int16x4x2_t %.fca.0.1.insert
 242 }
 243
 244 define %struct.int32x2x2_t @test_vld2_s32(i32* readonly %a) { ; <2 x i32> vld2 repacked into the x2 struct; expects two-register ld2 (.2s), base xN or sp
 245 ; CHECK: test_vld2_s32
 246 ; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 247   %1 = bitcast i32* %a to i8*
 248   %vld2 = tail call { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8* %1, i32 4)
 249   %vld2.fca.0.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 0
 250   %vld2.fca.1.extract = extractvalue { <2 x i32>, <2 x i32> } %vld2, 1
 251   %.fca.0.0.insert = insertvalue %struct.int32x2x2_t undef, <2 x i32> %vld2.fca.0.extract, 0, 0
 252   %.fca.0.1.insert = insertvalue %struct.int32x2x2_t %.fca.0.0.insert, <2 x i32> %vld2.fca.1.extract, 0, 1
 253   ret %struct.int32x2x2_t %.fca.0.1.insert
 254 }
 255
 256 define %struct.int64x1x2_t @test_vld2_s64(i64* readonly %a) { ; <1 x i64> vld2: the expected output is a two-register ld1 (.1d) rather than ld2; base xN or sp
 257 ; CHECK: test_vld2_s64
 258 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 259   %1 = bitcast i64* %a to i8*
 260   %vld2 = tail call { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8* %1, i32 8)
 261   %vld2.fca.0.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 0
 262   %vld2.fca.1.extract = extractvalue { <1 x i64>, <1 x i64> } %vld2, 1
 263   %.fca.0.0.insert = insertvalue %struct.int64x1x2_t undef, <1 x i64> %vld2.fca.0.extract, 0, 0
 264   %.fca.0.1.insert = insertvalue %struct.int64x1x2_t %.fca.0.0.insert, <1 x i64> %vld2.fca.1.extract, 0, 1
 265   ret %struct.int64x1x2_t %.fca.0.1.insert
 266 }
 267
 268 define %struct.float32x2x2_t @test_vld2_f32(float* readonly %a) { ; <2 x float> vld2 repacked into the x2 struct; expects two-register ld2 (.2s), base xN or sp
 269 ; CHECK: test_vld2_f32
 270 ; CHECK: ld2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 271   %1 = bitcast float* %a to i8*
 272   %vld2 = tail call { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8* %1, i32 4)
 273   %vld2.fca.0.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 0
 274   %vld2.fca.1.extract = extractvalue { <2 x float>, <2 x float> } %vld2, 1
 275   %.fca.0.0.insert = insertvalue %struct.float32x2x2_t undef, <2 x float> %vld2.fca.0.extract, 0, 0
 276   %.fca.0.1.insert = insertvalue %struct.float32x2x2_t %.fca.0.0.insert, <2 x float> %vld2.fca.1.extract, 0, 1
 277   ret %struct.float32x2x2_t %.fca.0.1.insert
 278 }
 279
 280 define %struct.float64x1x2_t @test_vld2_f64(double* readonly %a) { ; <1 x double> vld2: the expected output is a two-register ld1 (.1d) rather than ld2; base xN or sp
 281 ; CHECK: test_vld2_f64
 282 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 283   %1 = bitcast double* %a to i8*
 284   %vld2 = tail call { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8* %1, i32 8)
 285   %vld2.fca.0.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 0
 286   %vld2.fca.1.extract = extractvalue { <1 x double>, <1 x double> } %vld2, 1
 287   %.fca.0.0.insert = insertvalue %struct.float64x1x2_t undef, <1 x double> %vld2.fca.0.extract, 0, 0
 288   %.fca.0.1.insert = insertvalue %struct.float64x1x2_t %.fca.0.0.insert, <1 x double> %vld2.fca.1.extract, 0, 1
 289   ret %struct.float64x1x2_t %.fca.0.1.insert
 290 }
 291
 292 define %struct.int8x16x3_t @test_vld3q_s8(i8* readonly %a) { ; <16 x i8> vld3 repacked into the x3 struct; expects three-register ld3 (.16b), base xN or sp
 293 ; CHECK: test_vld3q_s8
 294 ; CHECK: ld3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
 295   %vld3 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %a, i32 1)
 296   %vld3.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 0
 297   %vld3.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 1
 298   %vld3.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %vld3, 2
 299   %.fca.0.0.insert = insertvalue %struct.int8x16x3_t undef, <16 x i8> %vld3.fca.0.extract, 0, 0
 300   %.fca.0.1.insert = insertvalue %struct.int8x16x3_t %.fca.0.0.insert, <16 x i8> %vld3.fca.1.extract, 0, 1
 301   %.fca.0.2.insert = insertvalue %struct.int8x16x3_t %.fca.0.1.insert, <16 x i8> %vld3.fca.2.extract, 0, 2
 302   ret %struct.int8x16x3_t %.fca.0.2.insert
 303 }
 304
 305 define %struct.int16x8x3_t @test_vld3q_s16(i16* readonly %a) { ; <8 x i16> vld3 repacked into the x3 struct; expects three-register ld3 (.8h), base xN or sp
 306 ; CHECK: test_vld3q_s16
 307 ; CHECK: ld3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
 308   %1 = bitcast i16* %a to i8*
 309   %vld3 = tail call { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8* %1, i32 2)
 310   %vld3.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 0
 311   %vld3.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 1
 312   %vld3.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16> } %vld3, 2
 313   %.fca.0.0.insert = insertvalue %struct.int16x8x3_t undef, <8 x i16> %vld3.fca.0.extract, 0, 0
 314   %.fca.0.1.insert = insertvalue %struct.int16x8x3_t %.fca.0.0.insert, <8 x i16> %vld3.fca.1.extract, 0, 1
 315   %.fca.0.2.insert = insertvalue %struct.int16x8x3_t %.fca.0.1.insert, <8 x i16> %vld3.fca.2.extract, 0, 2
 316   ret %struct.int16x8x3_t %.fca.0.2.insert
 317 }
 318
 319 define %struct.int32x4x3_t @test_vld3q_s32(i32* readonly %a) { ; <4 x i32> vld3 repacked into the x3 struct; expects three-register ld3 (.4s), base xN or sp
 320 ; CHECK: test_vld3q_s32
 321 ; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 322   %1 = bitcast i32* %a to i8*
 323   %vld3 = tail call { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8* %1, i32 4)
 324   %vld3.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 0
 325   %vld3.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 1
 326   %vld3.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32> } %vld3, 2
 327   %.fca.0.0.insert = insertvalue %struct.int32x4x3_t undef, <4 x i32> %vld3.fca.0.extract, 0, 0
 328   %.fca.0.1.insert = insertvalue %struct.int32x4x3_t %.fca.0.0.insert, <4 x i32> %vld3.fca.1.extract, 0, 1
 329   %.fca.0.2.insert = insertvalue %struct.int32x4x3_t %.fca.0.1.insert, <4 x i32> %vld3.fca.2.extract, 0, 2
 330   ret %struct.int32x4x3_t %.fca.0.2.insert
 331 }
 332
 333 define %struct.int64x2x3_t @test_vld3q_s64(i64* readonly %a) { ; <2 x i64> vld3 repacked into the x3 struct; expects three-register ld3 (.2d), base xN or sp
 334 ; CHECK: test_vld3q_s64
 335 ; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 336   %1 = bitcast i64* %a to i8*
 337   %vld3 = tail call { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8* %1, i32 8)
 338   %vld3.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 0
 339   %vld3.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 1
 340   %vld3.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64> } %vld3, 2
 341   %.fca.0.0.insert = insertvalue %struct.int64x2x3_t undef, <2 x i64> %vld3.fca.0.extract, 0, 0
 342   %.fca.0.1.insert = insertvalue %struct.int64x2x3_t %.fca.0.0.insert, <2 x i64> %vld3.fca.1.extract, 0, 1
 343   %.fca.0.2.insert = insertvalue %struct.int64x2x3_t %.fca.0.1.insert, <2 x i64> %vld3.fca.2.extract, 0, 2
 344   ret %struct.int64x2x3_t %.fca.0.2.insert
 345 }
 346
 347 define %struct.float32x4x3_t @test_vld3q_f32(float* readonly %a) { ; <4 x float> vld3 repacked into the x3 struct; expects three-register ld3 (.4s), base xN or sp
 348 ; CHECK: test_vld3q_f32
 349 ; CHECK: ld3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 350   %1 = bitcast float* %a to i8*
 351   %vld3 = tail call { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8* %1, i32 4)
 352   %vld3.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 0
 353   %vld3.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 1
 354   %vld3.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float> } %vld3, 2
 355   %.fca.0.0.insert = insertvalue %struct.float32x4x3_t undef, <4 x float> %vld3.fca.0.extract, 0, 0
 356   %.fca.0.1.insert = insertvalue %struct.float32x4x3_t %.fca.0.0.insert, <4 x float> %vld3.fca.1.extract, 0, 1
 357   %.fca.0.2.insert = insertvalue %struct.float32x4x3_t %.fca.0.1.insert, <4 x float> %vld3.fca.2.extract, 0, 2
 358   ret %struct.float32x4x3_t %.fca.0.2.insert
 359 }
 360
 361 define %struct.float64x2x3_t @test_vld3q_f64(double* readonly %a) { ; <2 x double> vld3 repacked into the x3 struct; expects three-register ld3 (.2d), base xN or sp
 362 ; CHECK: test_vld3q_f64
 363 ; CHECK: ld3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 364   %1 = bitcast double* %a to i8*
 365   %vld3 = tail call { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8* %1, i32 8)
 366   %vld3.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 0
 367   %vld3.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 1
 368   %vld3.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double> } %vld3, 2
 369   %.fca.0.0.insert = insertvalue %struct.float64x2x3_t undef, <2 x double> %vld3.fca.0.extract, 0, 0
 370   %.fca.0.1.insert = insertvalue %struct.float64x2x3_t %.fca.0.0.insert, <2 x double> %vld3.fca.1.extract, 0, 1
 371   %.fca.0.2.insert = insertvalue %struct.float64x2x3_t %.fca.0.1.insert, <2 x double> %vld3.fca.2.extract, 0, 2
 372   ret %struct.float64x2x3_t %.fca.0.2.insert
 373 }
 374
 375 define %struct.int8x8x3_t @test_vld3_s8(i8* readonly %a) { ; <8 x i8> vld3 repacked into the x3 struct; expects three-register ld3 (.8b), base xN or sp
 376 ; CHECK: test_vld3_s8
 377 ; CHECK: ld3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 378   %vld3 = tail call { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8* %a, i32 1)
 379   %vld3.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 0
 380   %vld3.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 1
 381   %vld3.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8> } %vld3, 2
 382   %.fca.0.0.insert = insertvalue %struct.int8x8x3_t undef, <8 x i8> %vld3.fca.0.extract, 0, 0
 383   %.fca.0.1.insert = insertvalue %struct.int8x8x3_t %.fca.0.0.insert, <8 x i8> %vld3.fca.1.extract, 0, 1
 384   %.fca.0.2.insert = insertvalue %struct.int8x8x3_t %.fca.0.1.insert, <8 x i8> %vld3.fca.2.extract, 0, 2
 385   ret %struct.int8x8x3_t %.fca.0.2.insert
 386 }
 387
 388 define %struct.int16x4x3_t @test_vld3_s16(i16* readonly %a) { ; <4 x i16> vld3 repacked into the x3 struct; expects three-register ld3 (.4h), base xN or sp
 389 ; CHECK: test_vld3_s16
 390 ; CHECK: ld3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 391   %1 = bitcast i16* %a to i8*
 392   %vld3 = tail call { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8* %1, i32 2)
 393   %vld3.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 0
 394   %vld3.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 1
 395   %vld3.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16> } %vld3, 2
 396   %.fca.0.0.insert = insertvalue %struct.int16x4x3_t undef, <4 x i16> %vld3.fca.0.extract, 0, 0
 397   %.fca.0.1.insert = insertvalue %struct.int16x4x3_t %.fca.0.0.insert, <4 x i16> %vld3.fca.1.extract, 0, 1
 398   %.fca.0.2.insert = insertvalue %struct.int16x4x3_t %.fca.0.1.insert, <4 x i16> %vld3.fca.2.extract, 0, 2
 399   ret %struct.int16x4x3_t %.fca.0.2.insert
 400 }
 401
 402 define %struct.int32x2x3_t @test_vld3_s32(i32* readonly %a) { ; <2 x i32> vld3 repacked into the x3 struct; expects three-register ld3 (.2s), base xN or sp
 403 ; CHECK: test_vld3_s32
 404 ; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 405   %1 = bitcast i32* %a to i8*
 406   %vld3 = tail call { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8* %1, i32 4)
 407   %vld3.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 0
 408   %vld3.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 1
 409   %vld3.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32> } %vld3, 2
 410   %.fca.0.0.insert = insertvalue %struct.int32x2x3_t undef, <2 x i32> %vld3.fca.0.extract, 0, 0
 411   %.fca.0.1.insert = insertvalue %struct.int32x2x3_t %.fca.0.0.insert, <2 x i32> %vld3.fca.1.extract, 0, 1
 412   %.fca.0.2.insert = insertvalue %struct.int32x2x3_t %.fca.0.1.insert, <2 x i32> %vld3.fca.2.extract, 0, 2
 413   ret %struct.int32x2x3_t %.fca.0.2.insert
 414 }
 415
 416 define %struct.int64x1x3_t @test_vld3_s64(i64* readonly %a) { ; <1 x i64> vld3: the expected output is a three-register ld1 (.1d) rather than ld3; base xN or sp
 417 ; CHECK: test_vld3_s64
 418 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 419   %1 = bitcast i64* %a to i8*
 420   %vld3 = tail call { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8* %1, i32 8)
 421   %vld3.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 0
 422   %vld3.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 1
 423   %vld3.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64> } %vld3, 2
 424   %.fca.0.0.insert = insertvalue %struct.int64x1x3_t undef, <1 x i64> %vld3.fca.0.extract, 0, 0
 425   %.fca.0.1.insert = insertvalue %struct.int64x1x3_t %.fca.0.0.insert, <1 x i64> %vld3.fca.1.extract, 0, 1
 426   %.fca.0.2.insert = insertvalue %struct.int64x1x3_t %.fca.0.1.insert, <1 x i64> %vld3.fca.2.extract, 0, 2
 427   ret %struct.int64x1x3_t %.fca.0.2.insert
 428 }
 429
 430 define %struct.float32x2x3_t @test_vld3_f32(float* readonly %a) { ; <2 x float> vld3 repacked into the x3 struct; expects three-register ld3 (.2s), base xN or sp
 431 ; CHECK: test_vld3_f32
 432 ; CHECK: ld3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 433   %1 = bitcast float* %a to i8*
 434   %vld3 = tail call { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8* %1, i32 4)
 435   %vld3.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 0
 436   %vld3.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 1
 437   %vld3.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float> } %vld3, 2
 438   %.fca.0.0.insert = insertvalue %struct.float32x2x3_t undef, <2 x float> %vld3.fca.0.extract, 0, 0
 439   %.fca.0.1.insert = insertvalue %struct.float32x2x3_t %.fca.0.0.insert, <2 x float> %vld3.fca.1.extract, 0, 1
 440   %.fca.0.2.insert = insertvalue %struct.float32x2x3_t %.fca.0.1.insert, <2 x float> %vld3.fca.2.extract, 0, 2
 441   ret %struct.float32x2x3_t %.fca.0.2.insert
 442 }
 443
 444 define %struct.float64x1x3_t @test_vld3_f64(double* readonly %a) { ; <1 x double> vld3: the expected output is a three-register ld1 (.1d) rather than ld3; base xN or sp
 445 ; CHECK: test_vld3_f64
 446 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 447   %1 = bitcast double* %a to i8*
 448   %vld3 = tail call { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8* %1, i32 8)
 449   %vld3.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 0
 450   %vld3.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 1
 451   %vld3.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double> } %vld3, 2
 452   %.fca.0.0.insert = insertvalue %struct.float64x1x3_t undef, <1 x double> %vld3.fca.0.extract, 0, 0
 453   %.fca.0.1.insert = insertvalue %struct.float64x1x3_t %.fca.0.0.insert, <1 x double> %vld3.fca.1.extract, 0, 1
 454   %.fca.0.2.insert = insertvalue %struct.float64x1x3_t %.fca.0.1.insert, <1 x double> %vld3.fca.2.extract, 0, 2
 455   ret %struct.float64x1x3_t %.fca.0.2.insert
 456 }
 457
 458 define %struct.int8x16x4_t @test_vld4q_s8(i8* readonly %a) { ; <16 x i8> vld4 repacked into the x4 struct; expects four-register ld4 (.16b), base xN or sp
 459 ; CHECK: test_vld4q_s8
 460 ; CHECK: ld4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
 461   %vld4 = tail call { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8* %a, i32 1)
 462   %vld4.fca.0.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 0
 463   %vld4.fca.1.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 1
 464   %vld4.fca.2.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 2
 465   %vld4.fca.3.extract = extractvalue { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } %vld4, 3
 466   %.fca.0.0.insert = insertvalue %struct.int8x16x4_t undef, <16 x i8> %vld4.fca.0.extract, 0, 0
 467   %.fca.0.1.insert = insertvalue %struct.int8x16x4_t %.fca.0.0.insert, <16 x i8> %vld4.fca.1.extract, 0, 1
 468   %.fca.0.2.insert = insertvalue %struct.int8x16x4_t %.fca.0.1.insert, <16 x i8> %vld4.fca.2.extract, 0, 2
 469   %.fca.0.3.insert = insertvalue %struct.int8x16x4_t %.fca.0.2.insert, <16 x i8> %vld4.fca.3.extract, 0, 3
 470   ret %struct.int8x16x4_t %.fca.0.3.insert
 471 }
 472
 473 define %struct.int16x8x4_t @test_vld4q_s16(i16* readonly %a) { ; <8 x i16> vld4 repacked into the x4 struct; expects four-register ld4 (.8h), base xN or sp
 474 ; CHECK: test_vld4q_s16
 475 ; CHECK: ld4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
 476   %1 = bitcast i16* %a to i8*
 477   %vld4 = tail call { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8* %1, i32 2)
 478   %vld4.fca.0.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 0
 479   %vld4.fca.1.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 1
 480   %vld4.fca.2.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 2
 481   %vld4.fca.3.extract = extractvalue { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } %vld4, 3
 482   %.fca.0.0.insert = insertvalue %struct.int16x8x4_t undef, <8 x i16> %vld4.fca.0.extract, 0, 0
 483   %.fca.0.1.insert = insertvalue %struct.int16x8x4_t %.fca.0.0.insert, <8 x i16> %vld4.fca.1.extract, 0, 1
 484   %.fca.0.2.insert = insertvalue %struct.int16x8x4_t %.fca.0.1.insert, <8 x i16> %vld4.fca.2.extract, 0, 2
 485   %.fca.0.3.insert = insertvalue %struct.int16x8x4_t %.fca.0.2.insert, <8 x i16> %vld4.fca.3.extract, 0, 3
 486   ret %struct.int16x8x4_t %.fca.0.3.insert
 487 }
 488
 489 define %struct.int32x4x4_t @test_vld4q_s32(i32* readonly %a) { ; <4 x i32> vld4 repacked into the x4 struct; expects four-register ld4 (.4s), base xN or sp
 490 ; CHECK: test_vld4q_s32
 491 ; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 492   %1 = bitcast i32* %a to i8*
 493   %vld4 = tail call { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8* %1, i32 4)
 494   %vld4.fca.0.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 0
 495   %vld4.fca.1.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 1
 496   %vld4.fca.2.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 2
 497   %vld4.fca.3.extract = extractvalue { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } %vld4, 3
 498   %.fca.0.0.insert = insertvalue %struct.int32x4x4_t undef, <4 x i32> %vld4.fca.0.extract, 0, 0
 499   %.fca.0.1.insert = insertvalue %struct.int32x4x4_t %.fca.0.0.insert, <4 x i32> %vld4.fca.1.extract, 0, 1
 500   %.fca.0.2.insert = insertvalue %struct.int32x4x4_t %.fca.0.1.insert, <4 x i32> %vld4.fca.2.extract, 0, 2
 501   %.fca.0.3.insert = insertvalue %struct.int32x4x4_t %.fca.0.2.insert, <4 x i32> %vld4.fca.3.extract, 0, 3
 502   ret %struct.int32x4x4_t %.fca.0.3.insert
 503 }
 504
 505 define %struct.int64x2x4_t @test_vld4q_s64(i64* readonly %a) { ; <2 x i64> vld4 repacked into the x4 struct; expects four-register ld4 (.2d), base xN or sp
 506 ; CHECK: test_vld4q_s64
 507 ; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 508   %1 = bitcast i64* %a to i8*
 509   %vld4 = tail call { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8* %1, i32 8)
 510   %vld4.fca.0.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 0
 511   %vld4.fca.1.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 1
 512   %vld4.fca.2.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 2
 513   %vld4.fca.3.extract = extractvalue { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } %vld4, 3
 514   %.fca.0.0.insert = insertvalue %struct.int64x2x4_t undef, <2 x i64> %vld4.fca.0.extract, 0, 0
 515   %.fca.0.1.insert = insertvalue %struct.int64x2x4_t %.fca.0.0.insert, <2 x i64> %vld4.fca.1.extract, 0, 1
 516   %.fca.0.2.insert = insertvalue %struct.int64x2x4_t %.fca.0.1.insert, <2 x i64> %vld4.fca.2.extract, 0, 2
 517   %.fca.0.3.insert = insertvalue %struct.int64x2x4_t %.fca.0.2.insert, <2 x i64> %vld4.fca.3.extract, 0, 3
 518   ret %struct.int64x2x4_t %.fca.0.3.insert
 519 }
 520
 521 define %struct.float32x4x4_t @test_vld4q_f32(float* readonly %a) { ; <4 x float> vld4 repacked into the x4 struct; expects four-register ld4 (.4s), base xN or sp
 522 ; CHECK: test_vld4q_f32
 523 ; CHECK: ld4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 524   %1 = bitcast float* %a to i8*
 525   %vld4 = tail call { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8* %1, i32 4)
 526   %vld4.fca.0.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 0
 527   %vld4.fca.1.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 1
 528   %vld4.fca.2.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 2
 529   %vld4.fca.3.extract = extractvalue { <4 x float>, <4 x float>, <4 x float>, <4 x float> } %vld4, 3
 530   %.fca.0.0.insert = insertvalue %struct.float32x4x4_t undef, <4 x float> %vld4.fca.0.extract, 0, 0
 531   %.fca.0.1.insert = insertvalue %struct.float32x4x4_t %.fca.0.0.insert, <4 x float> %vld4.fca.1.extract, 0, 1
 532   %.fca.0.2.insert = insertvalue %struct.float32x4x4_t %.fca.0.1.insert, <4 x float> %vld4.fca.2.extract, 0, 2
 533   %.fca.0.3.insert = insertvalue %struct.float32x4x4_t %.fca.0.2.insert, <4 x float> %vld4.fca.3.extract, 0, 3
 534   ret %struct.float32x4x4_t %.fca.0.3.insert
 535 }
 536
 537 define %struct.float64x2x4_t @test_vld4q_f64(double* readonly %a) { ; vld4 of four <2 x double> vectors; expected to select ld4 with .2d registers
 538 ; CHECK-LABEL: test_vld4q_f64:
 539 ; CHECK: ld4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 540   %1 = bitcast double* %a to i8* ; the intrinsic takes its address as i8*
 541   %vld4 = tail call { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8* %1, i32 8)
 542   %vld4.fca.0.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 0
 543   %vld4.fca.1.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 1
 544   %vld4.fca.2.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 2
 545   %vld4.fca.3.extract = extractvalue { <2 x double>, <2 x double>, <2 x double>, <2 x double> } %vld4, 3
 546   %.fca.0.0.insert = insertvalue %struct.float64x2x4_t undef, <2 x double> %vld4.fca.0.extract, 0, 0 ; repack the four results into the returned struct
 547   %.fca.0.1.insert = insertvalue %struct.float64x2x4_t %.fca.0.0.insert, <2 x double> %vld4.fca.1.extract, 0, 1
 548   %.fca.0.2.insert = insertvalue %struct.float64x2x4_t %.fca.0.1.insert, <2 x double> %vld4.fca.2.extract, 0, 2
 549   %.fca.0.3.insert = insertvalue %struct.float64x2x4_t %.fca.0.2.insert, <2 x double> %vld4.fca.3.extract, 0, 3
 550   ret %struct.float64x2x4_t %.fca.0.3.insert
 551 }
 552
 553 define %struct.int8x8x4_t @test_vld4_s8(i8* readonly %a) { ; vld4 of four <8 x i8> vectors; expected to select ld4 with .8b registers
 554 ; CHECK-LABEL: test_vld4_s8:
 555 ; CHECK: ld4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 556   %vld4 = tail call { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8* %a, i32 1)
 557   %vld4.fca.0.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 0
 558   %vld4.fca.1.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 1
 559   %vld4.fca.2.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 2
 560   %vld4.fca.3.extract = extractvalue { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } %vld4, 3
 561   %.fca.0.0.insert = insertvalue %struct.int8x8x4_t undef, <8 x i8> %vld4.fca.0.extract, 0, 0 ; repack the four results into the returned struct
 562   %.fca.0.1.insert = insertvalue %struct.int8x8x4_t %.fca.0.0.insert, <8 x i8> %vld4.fca.1.extract, 0, 1
 563   %.fca.0.2.insert = insertvalue %struct.int8x8x4_t %.fca.0.1.insert, <8 x i8> %vld4.fca.2.extract, 0, 2
 564   %.fca.0.3.insert = insertvalue %struct.int8x8x4_t %.fca.0.2.insert, <8 x i8> %vld4.fca.3.extract, 0, 3
 565   ret %struct.int8x8x4_t %.fca.0.3.insert
 566 }
 567
 568 define %struct.int16x4x4_t @test_vld4_s16(i16* readonly %a) { ; vld4 of four <4 x i16> vectors; expected to select ld4 with .4h registers
 569 ; CHECK-LABEL: test_vld4_s16:
 570 ; CHECK: ld4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 571   %1 = bitcast i16* %a to i8* ; the intrinsic takes its address as i8*
 572   %vld4 = tail call { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8* %1, i32 2)
 573   %vld4.fca.0.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 0
 574   %vld4.fca.1.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 1
 575   %vld4.fca.2.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 2
 576   %vld4.fca.3.extract = extractvalue { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } %vld4, 3
 577   %.fca.0.0.insert = insertvalue %struct.int16x4x4_t undef, <4 x i16> %vld4.fca.0.extract, 0, 0 ; repack the four results into the returned struct
 578   %.fca.0.1.insert = insertvalue %struct.int16x4x4_t %.fca.0.0.insert, <4 x i16> %vld4.fca.1.extract, 0, 1
 579   %.fca.0.2.insert = insertvalue %struct.int16x4x4_t %.fca.0.1.insert, <4 x i16> %vld4.fca.2.extract, 0, 2
 580   %.fca.0.3.insert = insertvalue %struct.int16x4x4_t %.fca.0.2.insert, <4 x i16> %vld4.fca.3.extract, 0, 3
 581   ret %struct.int16x4x4_t %.fca.0.3.insert
 582 }
 583
 584 define %struct.int32x2x4_t @test_vld4_s32(i32* readonly %a) { ; vld4 of four <2 x i32> vectors; expected to select ld4 with .2s registers
 585 ; CHECK-LABEL: test_vld4_s32:
 586 ; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 587   %1 = bitcast i32* %a to i8* ; the intrinsic takes its address as i8*
 588   %vld4 = tail call { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8* %1, i32 4)
 589   %vld4.fca.0.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 0
 590   %vld4.fca.1.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 1
 591   %vld4.fca.2.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 2
 592   %vld4.fca.3.extract = extractvalue { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } %vld4, 3
 593   %.fca.0.0.insert = insertvalue %struct.int32x2x4_t undef, <2 x i32> %vld4.fca.0.extract, 0, 0 ; repack the four results into the returned struct
 594   %.fca.0.1.insert = insertvalue %struct.int32x2x4_t %.fca.0.0.insert, <2 x i32> %vld4.fca.1.extract, 0, 1
 595   %.fca.0.2.insert = insertvalue %struct.int32x2x4_t %.fca.0.1.insert, <2 x i32> %vld4.fca.2.extract, 0, 2
 596   %.fca.0.3.insert = insertvalue %struct.int32x2x4_t %.fca.0.2.insert, <2 x i32> %vld4.fca.3.extract, 0, 3
 597   ret %struct.int32x2x4_t %.fca.0.3.insert
 598 }
 599
 600 define %struct.int64x1x4_t @test_vld4_s64(i64* readonly %a) { ; vld4 of four <1 x i64> vectors; expected to select ld1 (not ld4) with four .1d registers
 601 ; CHECK-LABEL: test_vld4_s64:
 602 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 603   %1 = bitcast i64* %a to i8* ; the intrinsic takes its address as i8*
 604   %vld4 = tail call { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8* %1, i32 8)
 605   %vld4.fca.0.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 0
 606   %vld4.fca.1.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 1
 607   %vld4.fca.2.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 2
 608   %vld4.fca.3.extract = extractvalue { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } %vld4, 3
 609   %.fca.0.0.insert = insertvalue %struct.int64x1x4_t undef, <1 x i64> %vld4.fca.0.extract, 0, 0 ; repack the four results into the returned struct
 610   %.fca.0.1.insert = insertvalue %struct.int64x1x4_t %.fca.0.0.insert, <1 x i64> %vld4.fca.1.extract, 0, 1
 611   %.fca.0.2.insert = insertvalue %struct.int64x1x4_t %.fca.0.1.insert, <1 x i64> %vld4.fca.2.extract, 0, 2
 612   %.fca.0.3.insert = insertvalue %struct.int64x1x4_t %.fca.0.2.insert, <1 x i64> %vld4.fca.3.extract, 0, 3
 613   ret %struct.int64x1x4_t %.fca.0.3.insert
 614 }
 615
 616 define %struct.float32x2x4_t @test_vld4_f32(float* readonly %a) { ; vld4 of four <2 x float> vectors; expected to select ld4 with .2s registers
 617 ; CHECK-LABEL: test_vld4_f32:
 618 ; CHECK: ld4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 619   %1 = bitcast float* %a to i8* ; the intrinsic takes its address as i8*
 620   %vld4 = tail call { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8* %1, i32 4)
 621   %vld4.fca.0.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 0
 622   %vld4.fca.1.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 1
 623   %vld4.fca.2.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 2
 624   %vld4.fca.3.extract = extractvalue { <2 x float>, <2 x float>, <2 x float>, <2 x float> } %vld4, 3
 625   %.fca.0.0.insert = insertvalue %struct.float32x2x4_t undef, <2 x float> %vld4.fca.0.extract, 0, 0 ; repack the four results into the returned struct
 626   %.fca.0.1.insert = insertvalue %struct.float32x2x4_t %.fca.0.0.insert, <2 x float> %vld4.fca.1.extract, 0, 1
 627   %.fca.0.2.insert = insertvalue %struct.float32x2x4_t %.fca.0.1.insert, <2 x float> %vld4.fca.2.extract, 0, 2
 628   %.fca.0.3.insert = insertvalue %struct.float32x2x4_t %.fca.0.2.insert, <2 x float> %vld4.fca.3.extract, 0, 3
 629   ret %struct.float32x2x4_t %.fca.0.3.insert
 630 }
 631
 632 define %struct.float64x1x4_t @test_vld4_f64(double* readonly %a) { ; vld4 of four <1 x double> vectors; expected to select ld1 (not ld4) with four .1d registers
 633 ; CHECK-LABEL: test_vld4_f64:
 634 ; CHECK: ld1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 635   %1 = bitcast double* %a to i8* ; the intrinsic takes its address as i8*
 636   %vld4 = tail call { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8* %1, i32 8)
 637   %vld4.fca.0.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 0
 638   %vld4.fca.1.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 1
 639   %vld4.fca.2.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 2
 640   %vld4.fca.3.extract = extractvalue { <1 x double>, <1 x double>, <1 x double>, <1 x double> } %vld4, 3
 641   %.fca.0.0.insert = insertvalue %struct.float64x1x4_t undef, <1 x double> %vld4.fca.0.extract, 0, 0 ; repack the four results into the returned struct
 642   %.fca.0.1.insert = insertvalue %struct.float64x1x4_t %.fca.0.0.insert, <1 x double> %vld4.fca.1.extract, 0, 1
 643   %.fca.0.2.insert = insertvalue %struct.float64x1x4_t %.fca.0.1.insert, <1 x double> %vld4.fca.2.extract, 0, 2
 644   %.fca.0.3.insert = insertvalue %struct.float64x1x4_t %.fca.0.2.insert, <1 x double> %vld4.fca.3.extract, 0, 3
 645   ret %struct.float64x1x4_t %.fca.0.3.insert
 646 }
 647
 648 declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8*, i32) ; vld1 group: returns one vector. All load intrinsics below take (i8* address, i32); the i32 is presumably the alignment (callers in this file pass the element size in bytes) - TODO confirm
 649 declare <8 x i16> @llvm.arm.neon.vld1.v8i16(i8*, i32)
 650 declare <4 x i32> @llvm.arm.neon.vld1.v4i32(i8*, i32)
 651 declare <2 x i64> @llvm.arm.neon.vld1.v2i64(i8*, i32)
 652 declare <4 x float> @llvm.arm.neon.vld1.v4f32(i8*, i32)
 653 declare <2 x double> @llvm.arm.neon.vld1.v2f64(i8*, i32)
 654 declare <8 x i8> @llvm.arm.neon.vld1.v8i8(i8*, i32)
 655 declare <4 x i16> @llvm.arm.neon.vld1.v4i16(i8*, i32)
 656 declare <2 x i32> @llvm.arm.neon.vld1.v2i32(i8*, i32)
 657 declare <1 x i64> @llvm.arm.neon.vld1.v1i64(i8*, i32)
 658 declare <2 x float> @llvm.arm.neon.vld1.v2f32(i8*, i32)
 659 declare <1 x double> @llvm.arm.neon.vld1.v1f64(i8*, i32)
 660 declare { <16 x i8>, <16 x i8> } @llvm.arm.neon.vld2.v16i8(i8*, i32) ; vld2 group: returns a struct of two vectors
 661 declare { <8 x i16>, <8 x i16> } @llvm.arm.neon.vld2.v8i16(i8*, i32)
 662 declare { <4 x i32>, <4 x i32> } @llvm.arm.neon.vld2.v4i32(i8*, i32)
 663 declare { <2 x i64>, <2 x i64> } @llvm.arm.neon.vld2.v2i64(i8*, i32)
 664 declare { <4 x float>, <4 x float> } @llvm.arm.neon.vld2.v4f32(i8*, i32)
 665 declare { <2 x double>, <2 x double> } @llvm.arm.neon.vld2.v2f64(i8*, i32)
 666 declare { <8 x i8>, <8 x i8> } @llvm.arm.neon.vld2.v8i8(i8*, i32)
 667 declare { <4 x i16>, <4 x i16> } @llvm.arm.neon.vld2.v4i16(i8*, i32)
 668 declare { <2 x i32>, <2 x i32> } @llvm.arm.neon.vld2.v2i32(i8*, i32)
 669 declare { <1 x i64>, <1 x i64> } @llvm.arm.neon.vld2.v1i64(i8*, i32)
 670 declare { <2 x float>, <2 x float> } @llvm.arm.neon.vld2.v2f32(i8*, i32)
 671 declare { <1 x double>, <1 x double> } @llvm.arm.neon.vld2.v1f64(i8*, i32)
 672 declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) ; vld3 group: returns a struct of three vectors
 673 declare { <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld3.v8i16(i8*, i32)
 674 declare { <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld3.v4i32(i8*, i32)
 675 declare { <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld3.v2i64(i8*, i32)
 676 declare { <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld3.v4f32(i8*, i32)
 677 declare { <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld3.v2f64(i8*, i32)
 678 declare { <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld3.v8i8(i8*, i32)
 679 declare { <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld3.v4i16(i8*, i32)
 680 declare { <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld3.v2i32(i8*, i32)
 681 declare { <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld3.v1i64(i8*, i32)
 682 declare { <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld3.v2f32(i8*, i32)
 683 declare { <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld3.v1f64(i8*, i32)
 684 declare { <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld4.v16i8(i8*, i32) ; vld4 group: returns a struct of four vectors; called by the test_vld4* functions
 685 declare { <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16> } @llvm.arm.neon.vld4.v8i16(i8*, i32)
 686 declare { <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32> } @llvm.arm.neon.vld4.v4i32(i8*, i32)
 687 declare { <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64> } @llvm.arm.neon.vld4.v2i64(i8*, i32)
 688 declare { <4 x float>, <4 x float>, <4 x float>, <4 x float> } @llvm.arm.neon.vld4.v4f32(i8*, i32)
 689 declare { <2 x double>, <2 x double>, <2 x double>, <2 x double> } @llvm.arm.neon.vld4.v2f64(i8*, i32)
 690 declare { <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8> } @llvm.arm.neon.vld4.v8i8(i8*, i32)
 691 declare { <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16> } @llvm.arm.neon.vld4.v4i16(i8*, i32)
 692 declare { <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32> } @llvm.arm.neon.vld4.v2i32(i8*, i32)
 693 declare { <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64> } @llvm.arm.neon.vld4.v1i64(i8*, i32)
 694 declare { <2 x float>, <2 x float>, <2 x float>, <2 x float> } @llvm.arm.neon.vld4.v2f32(i8*, i32)
 695 declare { <1 x double>, <1 x double>, <1 x double>, <1 x double> } @llvm.arm.neon.vld4.v1f64(i8*, i32)
 696
 697 define void @test_vst1q_s8(i8* %a, <16 x i8> %b) { ; vst1 of one <16 x i8> vector; expected to select st1 with a .16b register
 698 ; CHECK-LABEL: test_vst1q_s8:
 699 ; CHECK: st1 {v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
 700   tail call void @llvm.arm.neon.vst1.v16i8(i8* %a, <16 x i8> %b, i32 1)
 701   ret void
 702 }
 703
 704 define void @test_vst1q_s16(i16* %a, <8 x i16> %b) { ; vst1 of one <8 x i16> vector; expected to select st1 with a .8h register
 705 ; CHECK-LABEL: test_vst1q_s16:
 706 ; CHECK: st1 {v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
 707   %1 = bitcast i16* %a to i8* ; the intrinsic takes its address as i8*
 708   tail call void @llvm.arm.neon.vst1.v8i16(i8* %1, <8 x i16> %b, i32 2)
 709   ret void
 710 }
 711
 712 define void @test_vst1q_s32(i32* %a, <4 x i32> %b) { ; vst1 of one <4 x i32> vector; expected to select st1 with a .4s register
 713 ; CHECK-LABEL: test_vst1q_s32:
 714 ; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 715   %1 = bitcast i32* %a to i8* ; the intrinsic takes its address as i8*
 716   tail call void @llvm.arm.neon.vst1.v4i32(i8* %1, <4 x i32> %b, i32 4)
 717   ret void
 718 }
 719
 720 define void @test_vst1q_s64(i64* %a, <2 x i64> %b) { ; vst1 of one <2 x i64> vector; expected to select st1 with a .2d register
 721 ; CHECK-LABEL: test_vst1q_s64:
 722 ; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 723   %1 = bitcast i64* %a to i8* ; the intrinsic takes its address as i8*
 724   tail call void @llvm.arm.neon.vst1.v2i64(i8* %1, <2 x i64> %b, i32 8)
 725   ret void
 726 }
 727
 728 define void @test_vst1q_f32(float* %a, <4 x float> %b) { ; vst1 of one <4 x float> vector; expected to select st1 with a .4s register
 729 ; CHECK-LABEL: test_vst1q_f32:
 730 ; CHECK: st1 {v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 731   %1 = bitcast float* %a to i8* ; the intrinsic takes its address as i8*
 732   tail call void @llvm.arm.neon.vst1.v4f32(i8* %1, <4 x float> %b, i32 4)
 733   ret void
 734 }
 735
 736 define void @test_vst1q_f64(double* %a, <2 x double> %b) { ; vst1 of one <2 x double> vector; expected to select st1 with a .2d register
 737 ; CHECK-LABEL: test_vst1q_f64:
 738 ; CHECK: st1 {v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 739   %1 = bitcast double* %a to i8* ; the intrinsic takes its address as i8*
 740   tail call void @llvm.arm.neon.vst1.v2f64(i8* %1, <2 x double> %b, i32 8)
 741   ret void
 742 }
 743
 744 define void @test_vst1_s8(i8* %a, <8 x i8> %b) { ; vst1 of one <8 x i8> vector; expected to select st1 with a .8b register
 745 ; CHECK-LABEL: test_vst1_s8:
 746 ; CHECK: st1 {v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 747   tail call void @llvm.arm.neon.vst1.v8i8(i8* %a, <8 x i8> %b, i32 1)
 748   ret void
 749 }
 750
 751 define void @test_vst1_s16(i16* %a, <4 x i16> %b) { ; vst1 of one <4 x i16> vector; expected to select st1 with a .4h register
 752 ; CHECK-LABEL: test_vst1_s16:
 753 ; CHECK: st1 {v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 754   %1 = bitcast i16* %a to i8* ; the intrinsic takes its address as i8*
 755   tail call void @llvm.arm.neon.vst1.v4i16(i8* %1, <4 x i16> %b, i32 2)
 756   ret void
 757 }
 758
 759 define void @test_vst1_s32(i32* %a, <2 x i32> %b) { ; vst1 of one <2 x i32> vector; expected to select st1 with a .2s register
 760 ; CHECK-LABEL: test_vst1_s32:
 761 ; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 762   %1 = bitcast i32* %a to i8* ; the intrinsic takes its address as i8*
 763   tail call void @llvm.arm.neon.vst1.v2i32(i8* %1, <2 x i32> %b, i32 4)
 764   ret void
 765 }
 766
 767 define void @test_vst1_s64(i64* %a, <1 x i64> %b) { ; vst1 of one <1 x i64> vector; expected to select st1 with a .1d register
 768 ; CHECK-LABEL: test_vst1_s64:
 769 ; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 770   %1 = bitcast i64* %a to i8* ; the intrinsic takes its address as i8*
 771   tail call void @llvm.arm.neon.vst1.v1i64(i8* %1, <1 x i64> %b, i32 8)
 772   ret void
 773 }
 774
 775 define void @test_vst1_f32(float* %a, <2 x float> %b) { ; vst1 of one <2 x float> vector; expected to select st1 with a .2s register
 776 ; CHECK-LABEL: test_vst1_f32:
 777 ; CHECK: st1 {v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 778   %1 = bitcast float* %a to i8* ; the intrinsic takes its address as i8*
 779   tail call void @llvm.arm.neon.vst1.v2f32(i8* %1, <2 x float> %b, i32 4)
 780   ret void
 781 }
 782
 783 define void @test_vst1_f64(double* %a, <1 x double> %b) { ; vst1 of one <1 x double> vector; expected to select st1 with a .1d register
 784 ; CHECK-LABEL: test_vst1_f64:
 785 ; CHECK: st1 {v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 786   %1 = bitcast double* %a to i8* ; the intrinsic takes its address as i8*
 787   tail call void @llvm.arm.neon.vst1.v1f64(i8* %1, <1 x double> %b, i32 8)
 788   ret void
 789 }
 790
 791 define void @test_vst2q_s8(i8* %a, [2 x <16 x i8>] %b.coerce) { ; vst2 of two <16 x i8> vectors; expected to select st2 with .16b registers
 792 ; CHECK-LABEL: test_vst2q_s8:
 793 ; CHECK: st2 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
 794   %b.coerce.fca.0.extract = extractvalue [2 x <16 x i8>] %b.coerce, 0 ; split the coerced array argument into its vectors
 795   %b.coerce.fca.1.extract = extractvalue [2 x <16 x i8>] %b.coerce, 1
 796   tail call void @llvm.arm.neon.vst2.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, i32 1)
 797   ret void
 798 }
 799
 800 define void @test_vst2q_s16(i16* %a, [2 x <8 x i16>] %b.coerce) { ; vst2 of two <8 x i16> vectors; expected to select st2 with .8h registers
 801 ; CHECK-LABEL: test_vst2q_s16:
 802 ; CHECK: st2 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
 803   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i16>] %b.coerce, 0 ; split the coerced array argument into its vectors
 804   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i16>] %b.coerce, 1
 805   %1 = bitcast i16* %a to i8* ; the intrinsic takes its address as i8*
 806   tail call void @llvm.arm.neon.vst2.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, i32 2)
 807   ret void
 808 }
 809
 810 define void @test_vst2q_s32(i32* %a, [2 x <4 x i32>] %b.coerce) { ; vst2 of two <4 x i32> vectors; expected to select st2 with .4s registers
 811 ; CHECK-LABEL: test_vst2q_s32:
 812 ; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 813   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i32>] %b.coerce, 0 ; split the coerced array argument into its vectors
 814   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i32>] %b.coerce, 1
 815   %1 = bitcast i32* %a to i8* ; the intrinsic takes its address as i8*
 816   tail call void @llvm.arm.neon.vst2.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, i32 4)
 817   ret void
 818 }
 819
 820 define void @test_vst2q_s64(i64* %a, [2 x <2 x i64>] %b.coerce) { ; vst2 of two <2 x i64> vectors; expected to select st2 with .2d registers
 821 ; CHECK-LABEL: test_vst2q_s64:
 822 ; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 823   %b.coerce.fca.0.extract = extractvalue [2 x <2 x i64>] %b.coerce, 0 ; split the coerced array argument into its vectors
 824   %b.coerce.fca.1.extract = extractvalue [2 x <2 x i64>] %b.coerce, 1
 825   %1 = bitcast i64* %a to i8* ; the intrinsic takes its address as i8*
 826   tail call void @llvm.arm.neon.vst2.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, i32 8)
 827   ret void
 828 }
 829
 830 define void @test_vst2q_f32(float* %a, [2 x <4 x float>] %b.coerce) { ; vst2 of two <4 x float> vectors; expected to select st2 with .4s registers
 831 ; CHECK-LABEL: test_vst2q_f32:
 832 ; CHECK: st2 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 833   %b.coerce.fca.0.extract = extractvalue [2 x <4 x float>] %b.coerce, 0 ; split the coerced array argument into its vectors
 834   %b.coerce.fca.1.extract = extractvalue [2 x <4 x float>] %b.coerce, 1
 835   %1 = bitcast float* %a to i8* ; the intrinsic takes its address as i8*
 836   tail call void @llvm.arm.neon.vst2.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, i32 4)
 837   ret void
 838 }
 839
 840 define void @test_vst2q_f64(double* %a, [2 x <2 x double>] %b.coerce) { ; vst2 of two <2 x double> vectors; expected to select st2 with .2d registers
 841 ; CHECK-LABEL: test_vst2q_f64:
 842 ; CHECK: st2 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 843   %b.coerce.fca.0.extract = extractvalue [2 x <2 x double>] %b.coerce, 0 ; split the coerced array argument into its vectors
 844   %b.coerce.fca.1.extract = extractvalue [2 x <2 x double>] %b.coerce, 1
 845   %1 = bitcast double* %a to i8* ; the intrinsic takes its address as i8*
 846   tail call void @llvm.arm.neon.vst2.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, i32 8)
 847   ret void
 848 }
 849
 850 define void @test_vst2_s8(i8* %a, [2 x <8 x i8>] %b.coerce) { ; vst2 of two <8 x i8> vectors; expected to select st2 with .8b registers
 851 ; CHECK-LABEL: test_vst2_s8:
 852 ; CHECK: st2 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 853   %b.coerce.fca.0.extract = extractvalue [2 x <8 x i8>] %b.coerce, 0 ; split the coerced array argument into its vectors
 854   %b.coerce.fca.1.extract = extractvalue [2 x <8 x i8>] %b.coerce, 1
 855   tail call void @llvm.arm.neon.vst2.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, i32 1)
 856   ret void
 857 }
 858
 859 define void @test_vst2_s16(i16* %a, [2 x <4 x i16>] %b.coerce) { ; vst2 of two <4 x i16> vectors; expected to select st2 with .4h registers
 860 ; CHECK-LABEL: test_vst2_s16:
 861 ; CHECK: st2 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 862   %b.coerce.fca.0.extract = extractvalue [2 x <4 x i16>] %b.coerce, 0 ; split the coerced array argument into its vectors
 863   %b.coerce.fca.1.extract = extractvalue [2 x <4 x i16>] %b.coerce, 1
 864   %1 = bitcast i16* %a to i8* ; the intrinsic takes its address as i8*
 865   tail call void @llvm.arm.neon.vst2.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, i32 2)
 866   ret void
 867 }
 868
 869 define void @test_vst2_s32(i32* %a, [2 x <2 x i32>] %b.coerce) { ; vst2 of two <2 x i32> vectors; expected to select st2 with .2s registers
 870 ; CHECK-LABEL: test_vst2_s32:
 871 ; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 872   %b.coerce.fca.0.extract = extractvalue [2 x <2 x i32>] %b.coerce, 0 ; split the coerced array argument into its vectors
 873   %b.coerce.fca.1.extract = extractvalue [2 x <2 x i32>] %b.coerce, 1
 874   %1 = bitcast i32* %a to i8* ; the intrinsic takes its address as i8*
 875   tail call void @llvm.arm.neon.vst2.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, i32 4)
 876   ret void
 877 }
 878
 879 define void @test_vst2_s64(i64* %a, [2 x <1 x i64>] %b.coerce) { ; vst2 of two <1 x i64> vectors; expected to select st1 (not st2) with two .1d registers
 880 ; CHECK-LABEL: test_vst2_s64:
 881 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 882   %b.coerce.fca.0.extract = extractvalue [2 x <1 x i64>] %b.coerce, 0 ; split the coerced array argument into its vectors
 883   %b.coerce.fca.1.extract = extractvalue [2 x <1 x i64>] %b.coerce, 1
 884   %1 = bitcast i64* %a to i8* ; the intrinsic takes its address as i8*
 885   tail call void @llvm.arm.neon.vst2.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, i32 8)
 886   ret void
 887 }
 888
 889 define void @test_vst2_f32(float* %a, [2 x <2 x float>] %b.coerce) { ; vst2 of two <2 x float> vectors; expected to select st2 with .2s registers
 890 ; CHECK-LABEL: test_vst2_f32:
 891 ; CHECK: st2 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 892   %b.coerce.fca.0.extract = extractvalue [2 x <2 x float>] %b.coerce, 0 ; split the coerced array argument into its vectors
 893   %b.coerce.fca.1.extract = extractvalue [2 x <2 x float>] %b.coerce, 1
 894   %1 = bitcast float* %a to i8* ; the intrinsic takes its address as i8*
 895   tail call void @llvm.arm.neon.vst2.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, i32 4)
 896   ret void
 897 }
 898
 899 define void @test_vst2_f64(double* %a, [2 x <1 x double>] %b.coerce) { ; vst2 of two <1 x double> vectors; expected to select st1 (not st2) with two .1d registers
 900 ; CHECK-LABEL: test_vst2_f64:
 901 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
 902   %b.coerce.fca.0.extract = extractvalue [2 x <1 x double>] %b.coerce, 0 ; split the coerced array argument into its vectors
 903   %b.coerce.fca.1.extract = extractvalue [2 x <1 x double>] %b.coerce, 1
 904   %1 = bitcast double* %a to i8* ; the intrinsic takes its address as i8*
 905   tail call void @llvm.arm.neon.vst2.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, i32 8)
 906   ret void
 907 }
 908
 909 define void @test_vst3q_s8(i8* %a, [3 x <16 x i8>] %b.coerce) { ; vst3 of three <16 x i8> vectors; expected to select st3 with .16b registers
 910 ; CHECK-LABEL: test_vst3q_s8:
 911 ; CHECK: st3 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
 912   %b.coerce.fca.0.extract = extractvalue [3 x <16 x i8>] %b.coerce, 0 ; split the coerced array argument into its vectors
 913   %b.coerce.fca.1.extract = extractvalue [3 x <16 x i8>] %b.coerce, 1
 914   %b.coerce.fca.2.extract = extractvalue [3 x <16 x i8>] %b.coerce, 2
 915   tail call void @llvm.arm.neon.vst3.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, i32 1)
 916   ret void
 917 }
 918
 919 define void @test_vst3q_s16(i16* %a, [3 x <8 x i16>] %b.coerce) { ; vst3 of three <8 x i16> vectors; expected to select st3 with .8h registers
 920 ; CHECK-LABEL: test_vst3q_s16:
 921 ; CHECK: st3 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
 922   %b.coerce.fca.0.extract = extractvalue [3 x <8 x i16>] %b.coerce, 0 ; split the coerced array argument into its vectors
 923   %b.coerce.fca.1.extract = extractvalue [3 x <8 x i16>] %b.coerce, 1
 924   %b.coerce.fca.2.extract = extractvalue [3 x <8 x i16>] %b.coerce, 2
 925   %1 = bitcast i16* %a to i8* ; the intrinsic takes its address as i8*
 926   tail call void @llvm.arm.neon.vst3.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, i32 2)
 927   ret void
 928 }
 929
 930 define void @test_vst3q_s32(i32* %a, [3 x <4 x i32>] %b.coerce) { ; vst3 of three <4 x i32> vectors; expected to select st3 with .4s registers
 931 ; CHECK-LABEL: test_vst3q_s32:
 932 ; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 933   %b.coerce.fca.0.extract = extractvalue [3 x <4 x i32>] %b.coerce, 0 ; split the coerced array argument into its vectors
 934   %b.coerce.fca.1.extract = extractvalue [3 x <4 x i32>] %b.coerce, 1
 935   %b.coerce.fca.2.extract = extractvalue [3 x <4 x i32>] %b.coerce, 2
 936   %1 = bitcast i32* %a to i8* ; the intrinsic takes its address as i8*
 937   tail call void @llvm.arm.neon.vst3.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, i32 4)
 938   ret void
 939 }
 940
 941 define void @test_vst3q_s64(i64* %a, [3 x <2 x i64>] %b.coerce) { ; vst3 of three <2 x i64> vectors; expected to select st3 with .2d registers
 942 ; CHECK-LABEL: test_vst3q_s64:
 943 ; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 944   %b.coerce.fca.0.extract = extractvalue [3 x <2 x i64>] %b.coerce, 0 ; split the coerced array argument into its vectors
 945   %b.coerce.fca.1.extract = extractvalue [3 x <2 x i64>] %b.coerce, 1
 946   %b.coerce.fca.2.extract = extractvalue [3 x <2 x i64>] %b.coerce, 2
 947   %1 = bitcast i64* %a to i8* ; the intrinsic takes its address as i8*
 948   tail call void @llvm.arm.neon.vst3.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, i32 8)
 949   ret void
 950 }
 951
 952 define void @test_vst3q_f32(float* %a, [3 x <4 x float>] %b.coerce) { ; vst3 of three <4 x float> vectors; expected to select st3 with .4s registers
 953 ; CHECK-LABEL: test_vst3q_f32:
 954 ; CHECK: st3 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
 955   %b.coerce.fca.0.extract = extractvalue [3 x <4 x float>] %b.coerce, 0 ; split the coerced array argument into its vectors
 956   %b.coerce.fca.1.extract = extractvalue [3 x <4 x float>] %b.coerce, 1
 957   %b.coerce.fca.2.extract = extractvalue [3 x <4 x float>] %b.coerce, 2
 958   %1 = bitcast float* %a to i8* ; the intrinsic takes its address as i8*
 959   tail call void @llvm.arm.neon.vst3.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, i32 4)
 960   ret void
 961 }
 962
 963 define void @test_vst3q_f64(double* %a, [3 x <2 x double>] %b.coerce) { ; vst3 of three <2 x double> vectors; expected to select st3 with .2d registers
 964 ; CHECK-LABEL: test_vst3q_f64:
 965 ; CHECK: st3 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
 966   %b.coerce.fca.0.extract = extractvalue [3 x <2 x double>] %b.coerce, 0 ; split the coerced array argument into its vectors
 967   %b.coerce.fca.1.extract = extractvalue [3 x <2 x double>] %b.coerce, 1
 968   %b.coerce.fca.2.extract = extractvalue [3 x <2 x double>] %b.coerce, 2
 969   %1 = bitcast double* %a to i8* ; the intrinsic takes its address as i8*
 970   tail call void @llvm.arm.neon.vst3.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, i32 8)
 971   ret void
 972 }
 973
 974 define void @test_vst3_s8(i8* %a, [3 x <8 x i8>] %b.coerce) { ; vst3 of three <8 x i8> vectors; expected to select st3 with .8b registers
 975 ; CHECK-LABEL: test_vst3_s8:
 976 ; CHECK: st3 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
 977   %b.coerce.fca.0.extract = extractvalue [3 x <8 x i8>] %b.coerce, 0 ; split the coerced array argument into its vectors
 978   %b.coerce.fca.1.extract = extractvalue [3 x <8 x i8>] %b.coerce, 1
 979   %b.coerce.fca.2.extract = extractvalue [3 x <8 x i8>] %b.coerce, 2
 980   tail call void @llvm.arm.neon.vst3.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, i32 1)
 981   ret void
 982 }
 983
 984 define void @test_vst3_s16(i16* %a, [3 x <4 x i16>] %b.coerce) { ; vst3 of three <4 x i16> vectors; expected to select st3 with .4h registers
 985 ; CHECK-LABEL: test_vst3_s16:
 986 ; CHECK: st3 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
 987   %b.coerce.fca.0.extract = extractvalue [3 x <4 x i16>] %b.coerce, 0 ; split the coerced array argument into its vectors
 988   %b.coerce.fca.1.extract = extractvalue [3 x <4 x i16>] %b.coerce, 1
 989   %b.coerce.fca.2.extract = extractvalue [3 x <4 x i16>] %b.coerce, 2
 990   %1 = bitcast i16* %a to i8* ; the intrinsic takes its address as i8*
 991   tail call void @llvm.arm.neon.vst3.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, i32 2)
 992   ret void
 993 }
 994
 995 define void @test_vst3_s32(i32* %a, [3 x <2 x i32>] %b.coerce) { ; vst3 of three <2 x i32> vectors; expected to select st3 with .2s registers
 996 ; CHECK-LABEL: test_vst3_s32:
 997 ; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
 998   %b.coerce.fca.0.extract = extractvalue [3 x <2 x i32>] %b.coerce, 0 ; split the coerced array argument into its vectors
 999   %b.coerce.fca.1.extract = extractvalue [3 x <2 x i32>] %b.coerce, 1
1000   %b.coerce.fca.2.extract = extractvalue [3 x <2 x i32>] %b.coerce, 2
1001   %1 = bitcast i32* %a to i8* ; the intrinsic takes its address as i8*
1002   tail call void @llvm.arm.neon.vst3.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, i32 4)
1003   ret void
1004 }
1005
1006 define void @test_vst3_s64(i64* %a, [3 x <1 x i64>] %b.coerce) { ; vst3 of three <1 x i64> vectors; expected to select st1 (not st3) with three .1d registers
1007 ; CHECK-LABEL: test_vst3_s64:
1008 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1009   %b.coerce.fca.0.extract = extractvalue [3 x <1 x i64>] %b.coerce, 0 ; split the coerced array argument into its vectors
1010   %b.coerce.fca.1.extract = extractvalue [3 x <1 x i64>] %b.coerce, 1
1011   %b.coerce.fca.2.extract = extractvalue [3 x <1 x i64>] %b.coerce, 2
1012   %1 = bitcast i64* %a to i8* ; the intrinsic takes its address as i8*
1013   tail call void @llvm.arm.neon.vst3.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, i32 8)
1014   ret void
1015 }
1016
1017 define void @test_vst3_f32(float* %a, [3 x <2 x float>] %b.coerce) { ; vst3 of three <2 x float> vectors; expected to select st3 with .2s registers
1018 ; CHECK-LABEL: test_vst3_f32:
1019 ; CHECK: st3 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1020   %b.coerce.fca.0.extract = extractvalue [3 x <2 x float>] %b.coerce, 0 ; split the coerced array argument into its vectors
1021   %b.coerce.fca.1.extract = extractvalue [3 x <2 x float>] %b.coerce, 1
1022   %b.coerce.fca.2.extract = extractvalue [3 x <2 x float>] %b.coerce, 2
1023   %1 = bitcast float* %a to i8* ; the intrinsic takes its address as i8*
1024   tail call void @llvm.arm.neon.vst3.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, i32 4)
1025   ret void
1026 }
1027
1028 define void @test_vst3_f64(double* %a, [3 x <1 x double>] %b.coerce) { ; vst3 of three <1 x double> vectors; expected to select st1 (not st3) with three .1d registers
1029 ; CHECK-LABEL: test_vst3_f64:
1030 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1031   %b.coerce.fca.0.extract = extractvalue [3 x <1 x double>] %b.coerce, 0 ; split the coerced array argument into its vectors
1032   %b.coerce.fca.1.extract = extractvalue [3 x <1 x double>] %b.coerce, 1
1033   %b.coerce.fca.2.extract = extractvalue [3 x <1 x double>] %b.coerce, 2
1034   %1 = bitcast double* %a to i8* ; the intrinsic takes its address as i8*
1035   tail call void @llvm.arm.neon.vst3.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, i32 8)
1036   ret void
1037 }
1038
1039 define void @test_vst4q_s8(i8* %a, [4 x <16 x i8>] %b.coerce) { ; vst4 of four <16 x i8> vectors; expected to select st4 with .16b registers
1040 ; CHECK-LABEL: test_vst4q_s8:
1041 ; CHECK: st4 {v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b, v{{[0-9]+}}.16b}, [{{x[0-9]+|sp}}]
1042   %b.coerce.fca.0.extract = extractvalue [4 x <16 x i8>] %b.coerce, 0 ; split the coerced array argument into its vectors
1043   %b.coerce.fca.1.extract = extractvalue [4 x <16 x i8>] %b.coerce, 1
1044   %b.coerce.fca.2.extract = extractvalue [4 x <16 x i8>] %b.coerce, 2
1045   %b.coerce.fca.3.extract = extractvalue [4 x <16 x i8>] %b.coerce, 3
1046   tail call void @llvm.arm.neon.vst4.v16i8(i8* %a, <16 x i8> %b.coerce.fca.0.extract, <16 x i8> %b.coerce.fca.1.extract, <16 x i8> %b.coerce.fca.2.extract, <16 x i8> %b.coerce.fca.3.extract, i32 1)
1047   ret void
1048 }
1049
1050 define void @test_vst4q_s16(i16* %a, [4 x <8 x i16>] %b.coerce) { ; vst4 of four <8 x i16> vectors; expected to select st4 with .8h registers
1051 ; CHECK-LABEL: test_vst4q_s16:
1052 ; CHECK: st4 {v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h, v{{[0-9]+}}.8h}, [{{x[0-9]+|sp}}]
1053   %b.coerce.fca.0.extract = extractvalue [4 x <8 x i16>] %b.coerce, 0 ; split the coerced array argument into its vectors
1054   %b.coerce.fca.1.extract = extractvalue [4 x <8 x i16>] %b.coerce, 1
1055   %b.coerce.fca.2.extract = extractvalue [4 x <8 x i16>] %b.coerce, 2
1056   %b.coerce.fca.3.extract = extractvalue [4 x <8 x i16>] %b.coerce, 3
1057   %1 = bitcast i16* %a to i8* ; the intrinsic takes its address as i8*
1058   tail call void @llvm.arm.neon.vst4.v8i16(i8* %1, <8 x i16> %b.coerce.fca.0.extract, <8 x i16> %b.coerce.fca.1.extract, <8 x i16> %b.coerce.fca.2.extract, <8 x i16> %b.coerce.fca.3.extract, i32 2)
1059   ret void
1060 }
1061
1062 define void @test_vst4q_s32(i32* %a, [4 x <4 x i32>] %b.coerce) { ; vst4 of four <4 x i32> vectors; expected to select st4 with .4s registers
1063 ; CHECK-LABEL: test_vst4q_s32:
1064 ; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1065   %b.coerce.fca.0.extract = extractvalue [4 x <4 x i32>] %b.coerce, 0 ; split the coerced array argument into its vectors
1066   %b.coerce.fca.1.extract = extractvalue [4 x <4 x i32>] %b.coerce, 1
1067   %b.coerce.fca.2.extract = extractvalue [4 x <4 x i32>] %b.coerce, 2
1068   %b.coerce.fca.3.extract = extractvalue [4 x <4 x i32>] %b.coerce, 3
1069   %1 = bitcast i32* %a to i8* ; the intrinsic takes its address as i8*
1070   tail call void @llvm.arm.neon.vst4.v4i32(i8* %1, <4 x i32> %b.coerce.fca.0.extract, <4 x i32> %b.coerce.fca.1.extract, <4 x i32> %b.coerce.fca.2.extract, <4 x i32> %b.coerce.fca.3.extract, i32 4)
1071   ret void
1072 }
1073
; Test: 4-vector interleaved store of <2 x i64> data.
; The four vectors are extracted from the [4 x <2 x i64>] aggregate argument;
; %a is bitcast to i8* because the intrinsic call takes an i8* pointer. The
; call passes a trailing i32 8.
; Expected output: one st4 listing four .2d registers, addressed through an
; x register or sp.
; The function-name directive uses the -LABEL form (with the label's colon) so
; that directives from earlier functions cannot match past this function's
; label line and a failure is reported against this function.
1074 define void @test_vst4q_s64(i64* %a, [4 x <2 x i64>] %b.coerce) {
1075 ; CHECK-LABEL: test_vst4q_s64:
1076 ; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1077   %b.coerce.fca.0.extract = extractvalue [4 x <2 x i64>] %b.coerce, 0
1078   %b.coerce.fca.1.extract = extractvalue [4 x <2 x i64>] %b.coerce, 1
1079   %b.coerce.fca.2.extract = extractvalue [4 x <2 x i64>] %b.coerce, 2
1080   %b.coerce.fca.3.extract = extractvalue [4 x <2 x i64>] %b.coerce, 3
1081   %1 = bitcast i64* %a to i8*
1082   tail call void @llvm.arm.neon.vst4.v2i64(i8* %1, <2 x i64> %b.coerce.fca.0.extract, <2 x i64> %b.coerce.fca.1.extract, <2 x i64> %b.coerce.fca.2.extract, <2 x i64> %b.coerce.fca.3.extract, i32 8)
1083   ret void
1084 }
1085
; Test: 4-vector interleaved store of <4 x float> data.
; The four vectors are extracted from the [4 x <4 x float>] aggregate argument;
; %a is bitcast to i8* because the intrinsic call takes an i8* pointer. The
; call passes a trailing i32 4.
; Expected output: one st4 listing four .4s registers, addressed through an
; x register or sp.
; The function-name directive uses the -LABEL form (with the label's colon) so
; that directives from earlier functions cannot match past this function's
; label line and a failure is reported against this function.
1086 define void @test_vst4q_f32(float* %a, [4 x <4 x float>] %b.coerce) {
1087 ; CHECK-LABEL: test_vst4q_f32:
1088 ; CHECK: st4 {v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s, v{{[0-9]+}}.4s}, [{{x[0-9]+|sp}}]
1089   %b.coerce.fca.0.extract = extractvalue [4 x <4 x float>] %b.coerce, 0
1090   %b.coerce.fca.1.extract = extractvalue [4 x <4 x float>] %b.coerce, 1
1091   %b.coerce.fca.2.extract = extractvalue [4 x <4 x float>] %b.coerce, 2
1092   %b.coerce.fca.3.extract = extractvalue [4 x <4 x float>] %b.coerce, 3
1093   %1 = bitcast float* %a to i8*
1094   tail call void @llvm.arm.neon.vst4.v4f32(i8* %1, <4 x float> %b.coerce.fca.0.extract, <4 x float> %b.coerce.fca.1.extract, <4 x float> %b.coerce.fca.2.extract, <4 x float> %b.coerce.fca.3.extract, i32 4)
1095   ret void
1096 }
1097
; Test: 4-vector interleaved store of <2 x double> data.
; The four vectors are extracted from the [4 x <2 x double>] aggregate
; argument; %a is bitcast to i8* because the intrinsic call takes an i8*
; pointer. The call passes a trailing i32 8.
; Expected output: one st4 listing four .2d registers, addressed through an
; x register or sp.
; The function-name directive uses the -LABEL form (with the label's colon) so
; that directives from earlier functions cannot match past this function's
; label line and a failure is reported against this function.
1098 define void @test_vst4q_f64(double* %a, [4 x <2 x double>] %b.coerce) {
1099 ; CHECK-LABEL: test_vst4q_f64:
1100 ; CHECK: st4 {v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d, v{{[0-9]+}}.2d}, [{{x[0-9]+|sp}}]
1101   %b.coerce.fca.0.extract = extractvalue [4 x <2 x double>] %b.coerce, 0
1102   %b.coerce.fca.1.extract = extractvalue [4 x <2 x double>] %b.coerce, 1
1103   %b.coerce.fca.2.extract = extractvalue [4 x <2 x double>] %b.coerce, 2
1104   %b.coerce.fca.3.extract = extractvalue [4 x <2 x double>] %b.coerce, 3
1105   %1 = bitcast double* %a to i8*
1106   tail call void @llvm.arm.neon.vst4.v2f64(i8* %1, <2 x double> %b.coerce.fca.0.extract, <2 x double> %b.coerce.fca.1.extract, <2 x double> %b.coerce.fca.2.extract, <2 x double> %b.coerce.fca.3.extract, i32 8)
1107   ret void
1108 }
1109
; Test: 4-vector interleaved store of <8 x i8> data.
; The four vectors are extracted from the [4 x <8 x i8>] aggregate argument and
; passed, together with %a and a trailing i32 1, to @llvm.arm.neon.vst4.v8i8.
; Expected output: one st4 listing four .8b registers, addressed through an
; x register or sp.
; The function-name directive uses the -LABEL form (with the label's colon) so
; that directives from earlier functions cannot match past this function's
; label line and a failure is reported against this function.
1110 define void @test_vst4_s8(i8* %a, [4 x <8 x i8>] %b.coerce) {
1111 ; CHECK-LABEL: test_vst4_s8:
1112 ; CHECK: st4 {v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b, v{{[0-9]+}}.8b}, [{{x[0-9]+|sp}}]
1113   %b.coerce.fca.0.extract = extractvalue [4 x <8 x i8>] %b.coerce, 0
1114   %b.coerce.fca.1.extract = extractvalue [4 x <8 x i8>] %b.coerce, 1
1115   %b.coerce.fca.2.extract = extractvalue [4 x <8 x i8>] %b.coerce, 2
1116   %b.coerce.fca.3.extract = extractvalue [4 x <8 x i8>] %b.coerce, 3
1117   tail call void @llvm.arm.neon.vst4.v8i8(i8* %a, <8 x i8> %b.coerce.fca.0.extract, <8 x i8> %b.coerce.fca.1.extract, <8 x i8> %b.coerce.fca.2.extract, <8 x i8> %b.coerce.fca.3.extract, i32 1)
1118   ret void
1119 }
1120
; Test: 4-vector interleaved store of <4 x i16> data.
; The four vectors are extracted from the [4 x <4 x i16>] aggregate argument;
; %a is bitcast to i8* because the intrinsic call takes an i8* pointer. The
; call passes a trailing i32 2.
; Expected output: one st4 listing four .4h registers, addressed through an
; x register or sp.
; The function-name directive uses the -LABEL form (with the label's colon) so
; that directives from earlier functions cannot match past this function's
; label line and a failure is reported against this function.
1121 define void @test_vst4_s16(i16* %a, [4 x <4 x i16>] %b.coerce) {
1122 ; CHECK-LABEL: test_vst4_s16:
1123 ; CHECK: st4 {v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h, v{{[0-9]+}}.4h}, [{{x[0-9]+|sp}}]
1124   %b.coerce.fca.0.extract = extractvalue [4 x <4 x i16>] %b.coerce, 0
1125   %b.coerce.fca.1.extract = extractvalue [4 x <4 x i16>] %b.coerce, 1
1126   %b.coerce.fca.2.extract = extractvalue [4 x <4 x i16>] %b.coerce, 2
1127   %b.coerce.fca.3.extract = extractvalue [4 x <4 x i16>] %b.coerce, 3
1128   %1 = bitcast i16* %a to i8*
1129   tail call void @llvm.arm.neon.vst4.v4i16(i8* %1, <4 x i16> %b.coerce.fca.0.extract, <4 x i16> %b.coerce.fca.1.extract, <4 x i16> %b.coerce.fca.2.extract, <4 x i16> %b.coerce.fca.3.extract, i32 2)
1130   ret void
1131 }
1132
; Test: 4-vector interleaved store of <2 x i32> data.
; The four vectors are extracted from the [4 x <2 x i32>] aggregate argument;
; %a is bitcast to i8* because the intrinsic call takes an i8* pointer. The
; call passes a trailing i32 4.
; Expected output: one st4 listing four .2s registers, addressed through an
; x register or sp.
; The function-name directive uses the -LABEL form (with the label's colon) so
; that directives from earlier functions cannot match past this function's
; label line and a failure is reported against this function.
1133 define void @test_vst4_s32(i32* %a, [4 x <2 x i32>] %b.coerce) {
1134 ; CHECK-LABEL: test_vst4_s32:
1135 ; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1136   %b.coerce.fca.0.extract = extractvalue [4 x <2 x i32>] %b.coerce, 0
1137   %b.coerce.fca.1.extract = extractvalue [4 x <2 x i32>] %b.coerce, 1
1138   %b.coerce.fca.2.extract = extractvalue [4 x <2 x i32>] %b.coerce, 2
1139   %b.coerce.fca.3.extract = extractvalue [4 x <2 x i32>] %b.coerce, 3
1140   %1 = bitcast i32* %a to i8*
1141   tail call void @llvm.arm.neon.vst4.v2i32(i8* %1, <2 x i32> %b.coerce.fca.0.extract, <2 x i32> %b.coerce.fca.1.extract, <2 x i32> %b.coerce.fca.2.extract, <2 x i32> %b.coerce.fca.3.extract, i32 4)
1142   ret void
1143 }
1144
; Test: 4-vector store of <1 x i64> data through the vst4 intrinsic.
; The four vectors are extracted from the [4 x <1 x i64>] aggregate argument;
; %a is bitcast to i8* because the intrinsic call takes an i8* pointer. The
; call passes a trailing i32 8.
; Expected output: a four-register st1 with .1d operands rather than an st4
; (the A64 st4 instruction has no .1d arrangement), addressed through an
; x register or sp.
; The function-name directive uses the -LABEL form (with the label's colon) so
; that directives from earlier functions cannot match past this function's
; label line and a failure is reported against this function.
1145 define void @test_vst4_s64(i64* %a, [4 x <1 x i64>] %b.coerce) {
1146 ; CHECK-LABEL: test_vst4_s64:
1147 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1148   %b.coerce.fca.0.extract = extractvalue [4 x <1 x i64>] %b.coerce, 0
1149   %b.coerce.fca.1.extract = extractvalue [4 x <1 x i64>] %b.coerce, 1
1150   %b.coerce.fca.2.extract = extractvalue [4 x <1 x i64>] %b.coerce, 2
1151   %b.coerce.fca.3.extract = extractvalue [4 x <1 x i64>] %b.coerce, 3
1152   %1 = bitcast i64* %a to i8*
1153   tail call void @llvm.arm.neon.vst4.v1i64(i8* %1, <1 x i64> %b.coerce.fca.0.extract, <1 x i64> %b.coerce.fca.1.extract, <1 x i64> %b.coerce.fca.2.extract, <1 x i64> %b.coerce.fca.3.extract, i32 8)
1154   ret void
1155 }
1156
; Test: 4-vector interleaved store of <2 x float> data.
; The four vectors are extracted from the [4 x <2 x float>] aggregate argument;
; %a is bitcast to i8* because the intrinsic call takes an i8* pointer. The
; call passes a trailing i32 4.
; Expected output: one st4 listing four .2s registers, addressed through an
; x register or sp.
; The function-name directive uses the -LABEL form (with the label's colon) so
; that directives from earlier functions cannot match past this function's
; label line and a failure is reported against this function.
1157 define void @test_vst4_f32(float* %a, [4 x <2 x float>] %b.coerce) {
1158 ; CHECK-LABEL: test_vst4_f32:
1159 ; CHECK: st4 {v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s, v{{[0-9]+}}.2s}, [{{x[0-9]+|sp}}]
1160   %b.coerce.fca.0.extract = extractvalue [4 x <2 x float>] %b.coerce, 0
1161   %b.coerce.fca.1.extract = extractvalue [4 x <2 x float>] %b.coerce, 1
1162   %b.coerce.fca.2.extract = extractvalue [4 x <2 x float>] %b.coerce, 2
1163   %b.coerce.fca.3.extract = extractvalue [4 x <2 x float>] %b.coerce, 3
1164   %1 = bitcast float* %a to i8*
1165   tail call void @llvm.arm.neon.vst4.v2f32(i8* %1, <2 x float> %b.coerce.fca.0.extract, <2 x float> %b.coerce.fca.1.extract, <2 x float> %b.coerce.fca.2.extract, <2 x float> %b.coerce.fca.3.extract, i32 4)
1166   ret void
1167 }
1168
; Test: 4-vector store of <1 x double> data through the vst4 intrinsic.
; The four vectors are extracted from the [4 x <1 x double>] aggregate
; argument; %a is bitcast to i8* because the intrinsic call takes an i8*
; pointer. The call passes a trailing i32 8.
; Expected output: a four-register st1 with .1d operands rather than an st4
; (the A64 st4 instruction has no .1d arrangement), addressed through an
; x register or sp.
; The function-name directive uses the -LABEL form (with the label's colon) so
; that directives from earlier functions cannot match past this function's
; label line and a failure is reported against this function.
1169 define void @test_vst4_f64(double* %a, [4 x <1 x double>] %b.coerce) {
1170 ; CHECK-LABEL: test_vst4_f64:
1171 ; CHECK: st1 {v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d, v{{[0-9]+}}.1d}, [{{x[0-9]+|sp}}]
1172   %b.coerce.fca.0.extract = extractvalue [4 x <1 x double>] %b.coerce, 0
1173   %b.coerce.fca.1.extract = extractvalue [4 x <1 x double>] %b.coerce, 1
1174   %b.coerce.fca.2.extract = extractvalue [4 x <1 x double>] %b.coerce, 2
1175   %b.coerce.fca.3.extract = extractvalue [4 x <1 x double>] %b.coerce, 3
1176   %1 = bitcast double* %a to i8*
1177   tail call void @llvm.arm.neon.vst4.v1f64(i8* %1, <1 x double> %b.coerce.fca.0.extract, <1 x double> %b.coerce.fca.1.extract, <1 x double> %b.coerce.fca.2.extract, <1 x double> %b.coerce.fca.3.extract, i32 8)
1178   ret void
1179 }
1180
; Declarations of the llvm.arm.neon.vstN store intrinsics called by the tests
; in this file. Every declaration returns void and takes an i8* pointer, then
; N vectors of one type (the type named in the intrinsic suffix), then a
; trailing i32 (presumably the alignment operand -- confirm against the
; intrinsic definition).

; vst1: one vector operand.
1181 declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32)
1182 declare void @llvm.arm.neon.vst1.v8i16(i8*, <8 x i16>, i32)
1183 declare void @llvm.arm.neon.vst1.v4i32(i8*, <4 x i32>, i32)
1184 declare void @llvm.arm.neon.vst1.v2i64(i8*, <2 x i64>, i32)
1185 declare void @llvm.arm.neon.vst1.v4f32(i8*, <4 x float>, i32)
1186 declare void @llvm.arm.neon.vst1.v2f64(i8*, <2 x double>, i32)
1187 declare void @llvm.arm.neon.vst1.v8i8(i8*, <8 x i8>, i32)
1188 declare void @llvm.arm.neon.vst1.v4i16(i8*, <4 x i16>, i32)
1189 declare void @llvm.arm.neon.vst1.v2i32(i8*, <2 x i32>, i32)
1190 declare void @llvm.arm.neon.vst1.v1i64(i8*, <1 x i64>, i32)
1191 declare void @llvm.arm.neon.vst1.v2f32(i8*, <2 x float>, i32)
1192 declare void @llvm.arm.neon.vst1.v1f64(i8*, <1 x double>, i32)
; vst2: two vector operands.
1193 declare void @llvm.arm.neon.vst2.v16i8(i8*, <16 x i8>, <16 x i8>, i32)
1194 declare void @llvm.arm.neon.vst2.v8i16(i8*, <8 x i16>, <8 x i16>, i32)
1195 declare void @llvm.arm.neon.vst2.v4i32(i8*, <4 x i32>, <4 x i32>, i32)
1196 declare void @llvm.arm.neon.vst2.v2i64(i8*, <2 x i64>, <2 x i64>, i32)
1197 declare void @llvm.arm.neon.vst2.v4f32(i8*, <4 x float>, <4 x float>, i32)
1198 declare void @llvm.arm.neon.vst2.v2f64(i8*, <2 x double>, <2 x double>, i32)
1199 declare void @llvm.arm.neon.vst2.v8i8(i8*, <8 x i8>, <8 x i8>, i32)
1200 declare void @llvm.arm.neon.vst2.v4i16(i8*, <4 x i16>, <4 x i16>, i32)
1201 declare void @llvm.arm.neon.vst2.v2i32(i8*, <2 x i32>, <2 x i32>, i32)
1202 declare void @llvm.arm.neon.vst2.v1i64(i8*, <1 x i64>, <1 x i64>, i32)
1203 declare void @llvm.arm.neon.vst2.v2f32(i8*, <2 x float>, <2 x float>, i32)
1204 declare void @llvm.arm.neon.vst2.v1f64(i8*, <1 x double>, <1 x double>, i32)
; vst3: three vector operands.
1205 declare void @llvm.arm.neon.vst3.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, i32)
1206 declare void @llvm.arm.neon.vst3.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, i32)
1207 declare void @llvm.arm.neon.vst3.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, i32)
1208 declare void @llvm.arm.neon.vst3.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, i32)
1209 declare void @llvm.arm.neon.vst3.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, i32)
1210 declare void @llvm.arm.neon.vst3.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, i32)
1211 declare void @llvm.arm.neon.vst3.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, i32)
1212 declare void @llvm.arm.neon.vst3.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, i32)
1213 declare void @llvm.arm.neon.vst3.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, i32)
1214 declare void @llvm.arm.neon.vst3.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, i32)
1215 declare void @llvm.arm.neon.vst3.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, i32)
1216 declare void @llvm.arm.neon.vst3.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, i32)
; vst4: four vector operands.
1217 declare void @llvm.arm.neon.vst4.v16i8(i8*, <16 x i8>, <16 x i8>, <16 x i8>, <16 x i8>, i32)
1218 declare void @llvm.arm.neon.vst4.v8i16(i8*, <8 x i16>, <8 x i16>, <8 x i16>, <8 x i16>, i32)
1219 declare void @llvm.arm.neon.vst4.v4i32(i8*, <4 x i32>, <4 x i32>, <4 x i32>, <4 x i32>, i32)
1220 declare void @llvm.arm.neon.vst4.v2i64(i8*, <2 x i64>, <2 x i64>, <2 x i64>, <2 x i64>, i32)
1221 declare void @llvm.arm.neon.vst4.v4f32(i8*, <4 x float>, <4 x float>, <4 x float>, <4 x float>, i32)
1222 declare void @llvm.arm.neon.vst4.v2f64(i8*, <2 x double>, <2 x double>, <2 x double>, <2 x double>, i32)
1223 declare void @llvm.arm.neon.vst4.v8i8(i8*, <8 x i8>, <8 x i8>, <8 x i8>, <8 x i8>, i32)
1224 declare void @llvm.arm.neon.vst4.v4i16(i8*, <4 x i16>, <4 x i16>, <4 x i16>, <4 x i16>, i32)
1225 declare void @llvm.arm.neon.vst4.v2i32(i8*, <2 x i32>, <2 x i32>, <2 x i32>, <2 x i32>, i32)
1226 declare void @llvm.arm.neon.vst4.v1i64(i8*, <1 x i64>, <1 x i64>, <1 x i64>, <1 x i64>, i32)
1227 declare void @llvm.arm.neon.vst4.v2f32(i8*, <2 x float>, <2 x float>, <2 x float>, <2 x float>, i32)
1228 declare void @llvm.arm.neon.vst4.v1f64(i8*, <1 x double>, <1 x double>, <1 x double>, <1 x double>, i32)