1 | /* fp16i.c |
2 | * |
3 | * Copyright 2021 Red Hat, Inc. |
4 | * |
5 | * This library is free software; you can redistribute it and/or |
6 | * modify it under the terms of the GNU Lesser General Public |
7 | * License as published by the Free Software Foundation; either |
8 | * version 2.1 of the License, or (at your option) any later version. |
9 | * |
10 | * This library is distributed in the hope that it will be useful, |
11 | * but WITHOUT ANY WARRANTY; without even the implied warranty of |
12 | * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
13 | * Lesser General Public License for more details. |
14 | * |
15 | * You should have received a copy of the GNU Lesser General Public |
16 | * License along with this program. If not, see <http://www.gnu.org/licenses/>. |
17 | * |
18 | * SPDX-License-Identifier: LGPL-2.1-or-later |
19 | */ |
20 | |
21 | #include "config.h" |
22 | |
23 | #include "fp16private.h" |
24 | |
25 | #ifdef HAVE_F16C |
26 | #include <immintrin.h> |
27 | |
/* Cast a pointer to the pointer-to-vector type handed to the 64-bit integer
 * load intrinsic below. MSVC (when it is not the clang front end) uses the
 * plain __m128i pointer type; every other compiler uses __m128i_u.
 *
 * Both the argument and the whole expansion are parenthesized so that an
 * expression argument such as `buf + 1` is converted as a unit, instead of
 * the cast binding to `buf` alone and the `+ 1` then stepping by a full
 * vector.
 */
#if defined(_MSC_VER) && !defined(__clang__)
#define CAST_M128I_P(a) ((__m128i const *) (a))
#else
#define CAST_M128I_P(a) ((__m128i_u const *) (a))
#endif
33 | void |
34 | float_to_half4_f16c (const float f[4], |
35 | guint16 h[4]) |
36 | { |
37 | __m128 s = _mm_loadu_ps (f); |
38 | __m128i i = _mm_cvtps_ph (s, 0); |
39 | _mm_storel_epi64 ((__m128i*)h, i); |
40 | } |
41 | |
42 | void |
43 | half_to_float4_f16c (const guint16 h[4], |
44 | float f[4]) |
45 | { |
46 | __m128i i = _mm_loadl_epi64 (CAST_M128I_P (h)); |
47 | __m128 s = _mm_cvtph_ps (i); |
48 | |
49 | _mm_store_ps (f, s); |
50 | } |
51 | |
/* Nonzero when the address held in p is a multiple of n.
 * n is parenthesized so that an expression argument (e.g. `8 + 8`) is
 * applied to the modulo as a whole. */
#define ALIGNED(p, n) (GPOINTER_TO_UINT (p) % (n) == 0)
53 | void |
54 | float_to_half_f16c (const float *f, |
55 | guint16 *h, |
56 | int n) |
57 | { |
58 | __m128 s; |
59 | __m128i i; |
60 | int j; |
61 | const float *ff = f; |
62 | guint16 *hh = h; |
63 | |
64 | for (j = 0; j < n; j++) |
65 | { |
66 | if (ALIGNED (ff, 16) && ALIGNED (hh, 16)) |
67 | break; |
68 | ff++; |
69 | hh++; |
70 | } |
71 | |
72 | float_to_half_c (f, h, j); |
73 | |
74 | for (; j + 4 < n; j += 4) |
75 | { |
76 | s = _mm_loadu_ps (ff); |
77 | i = _mm_cvtps_ph (s, 0); |
78 | _mm_storel_epi64 ((__m128i*)hh, i); |
79 | ff += 4; |
80 | hh += 4; |
81 | } |
82 | |
83 | if (j < n) |
84 | float_to_half_c (ff, hh, n - j); |
85 | } |
86 | |
87 | void |
88 | half_to_float_f16c (const guint16 *h, |
89 | float *f, |
90 | int n) |
91 | { |
92 | __m128i i; |
93 | __m128 s; |
94 | int j; |
95 | const guint16 *hh = h; |
96 | float *ff = f; |
97 | |
98 | for (j = 0; j < n; j++) |
99 | { |
100 | if (ALIGNED (ff, 16) && ALIGNED (hh, 16)) |
101 | break; |
102 | ff++; |
103 | hh++; |
104 | } |
105 | |
106 | half_to_float_c (h, f, j); |
107 | |
108 | for (; j + 4 < n; j += 4) |
109 | { |
110 | i = _mm_loadl_epi64 (CAST_M128I_P (hh)); |
111 | s = _mm_cvtph_ps (i); |
112 | _mm_store_ps (ff, s); |
113 | hh += 4; |
114 | ff += 4; |
115 | } |
116 | |
117 | if (j < n) |
118 | half_to_float_c (hh, ff, n - j); |
119 | } |
120 | |
121 | #endif /* HAVE_F16C */ |
122 | |
123 | |