/**************************************************************************
 *
 * Copyright (c) 2006-2007 Tungsten Graphics, Inc., Cedar Park, TX., USA
 * All Rights Reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the
 * "Software"), to deal in the Software without restriction, including
 * without limitation the rights to use, copy, modify, merge, publish,
 * distribute, sub license, and/or sell copies of the Software, and to
 * permit persons to whom the Software is furnished to do so, subject to
 * the following conditions:
 *
 * The above copyright notice and this permission notice (including the
 * next paragraph) shall be included in all copies or substantial portions
 * of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NON-INFRINGEMENT. IN NO EVENT SHALL
 * THE COPYRIGHT HOLDERS, AUTHORS AND/OR ITS SUPPLIERS BE LIABLE FOR ANY CLAIM,
 * DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR
 * OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE
 * USE OR OTHER DEALINGS IN THE SOFTWARE.
 *
 **************************************************************************/
/*
 * Authors: Thomas Hellström <thomas-at-tungstengraphics-dot-com>
 */
#include <linux/cc_platform.h>
#include <linux/export.h>
#include <linux/highmem.h>
#include <linux/ioport.h>
#include <linux/iosys-map.h>
#include <xen/xen.h>

#include <drm/drm_cache.h>

/* A small bounce buffer that fits on the stack. */
#define MEMCPY_BOUNCE_SIZE 128

#if defined(CONFIG_X86)
#include <asm/smp.h>

/*
 * clflushopt is an unordered instruction which needs fencing with mfence or
 * sfence to avoid ordering issues. For drm_clflush_page this fencing happens
 * in the caller.
 */
static void
drm_clflush_page(struct page *page)
{
	uint8_t *page_virtual;
	unsigned int i;
	const int size = boot_cpu_data.x86_clflush_size;

	if (unlikely(page == NULL))
		return;

	page_virtual = kmap_atomic(page);
	for (i = 0; i < PAGE_SIZE; i += size)
		clflushopt(page_virtual + i);
	kunmap_atomic(page_virtual);
}

static void drm_cache_flush_clflush(struct page *pages[],
				    unsigned long num_pages)
{
	unsigned long i;

	mb(); /*Full memory barrier used before so that CLFLUSH is ordered*/
	for (i = 0; i < num_pages; i++)
		drm_clflush_page(*pages++);
	mb(); /*Also used after CLFLUSH so that all cache is flushed*/
}
#endif

/**
 * drm_clflush_pages - Flush dcache lines of a set of pages.
 * @pages: List of pages to be flushed.
 * @num_pages: Number of pages in the array.
 *
 * Flush every data cache line entry that points to an address belonging
 * to a page in the array.
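 *
 * A minimal usage sketch (hypothetical @obj holding a page array filled by
 * e.g. drm_gem_get_pages()):
 *
 *   drm_clflush_pages(obj->pages, obj->size >> PAGE_SHIFT);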
 */
void
drm_clflush_pages(struct page *pages[], unsigned long num_pages)
{

#if defined(CONFIG_X86)
	if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
		drm_cache_flush_clflush(pages, num_pages);
		return;
	}

	if (wbinvd_on_all_cpus())
		pr_err("Timed out waiting for cache flush\n");

#elif defined(__powerpc__)
	unsigned long i;

	for (i = 0; i < num_pages; i++) {
		struct page *page = pages[i];
		void *page_virtual;

		if (unlikely(page == NULL))
			continue;

		page_virtual = kmap_atomic(page);
		flush_dcache_range((unsigned long)page_virtual,
				   (unsigned long)page_virtual + PAGE_SIZE);
		kunmap_atomic(page_virtual);
	}
#else
	WARN_ONCE(1, "Architecture has no drm_cache.c support\n");
#endif
}
EXPORT_SYMBOL(drm_clflush_pages);

/**
 * drm_clflush_sg - Flush dcache lines pointing to a scatter-gather list.
 * @st: struct sg_table whose pages will be flushed.
 *
 * Flush every data cache line entry that points to an address in the
 * sg.
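 *
 * A minimal usage sketch (hypothetical @obj holding an sg_table built with
 * e.g. drm_prime_pages_to_sg()):
 *
 *   drm_clflush_sg(obj->sgt);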
 */
void
drm_clflush_sg(struct sg_table *st)
{
#if defined(CONFIG_X86)
	if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
		struct sg_page_iter sg_iter;

		mb(); /*CLFLUSH is ordered only by using memory barriers*/
		for_each_sgtable_page(st, &sg_iter, 0)
			drm_clflush_page(sg_page_iter_page(&sg_iter));
		mb(); /*Make sure that every cache line entry is flushed*/

		return;
	}

	if (wbinvd_on_all_cpus())
		pr_err("Timed out waiting for cache flush\n");
#else
	WARN_ONCE(1, "Architecture has no drm_cache.c support\n");
#endif
}
EXPORT_SYMBOL(drm_clflush_sg);

/**
 * drm_clflush_virt_range - Flush dcache lines of a region
 * @addr: Initial kernel memory address.
 * @length: Region size.
 *
 * Flush every data cache line entry that points to an address in the
 * region requested.
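 *
 * A minimal usage sketch (hypothetical @vaddr obtained from a prior vmap of
 * the buffer):
 *
 *   drm_clflush_virt_range(vaddr + offset, size);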
 */
void
drm_clflush_virt_range(void *addr, unsigned long length)
{
#if defined(CONFIG_X86)
	if (static_cpu_has(X86_FEATURE_CLFLUSH)) {
		const int size = boot_cpu_data.x86_clflush_size;
		void *end = addr + length;

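		/* Round the start address down to a cache-line boundary. */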
		addr = (void *)(((unsigned long)addr) & -size);
		mb(); /*CLFLUSH is only ordered with a full memory barrier*/
		for (; addr < end; addr += size)
			clflushopt(addr);
		clflushopt(end - 1); /* force serialisation */
		mb(); /*Ensure that every data cache line entry is flushed*/
		return;
	}

	if (wbinvd_on_all_cpus())
		pr_err("Timed out waiting for cache flush\n");
#else
	WARN_ONCE(1, "Architecture has no drm_cache.c support\n");
#endif
}
EXPORT_SYMBOL(drm_clflush_virt_range);

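/**
 * drm_need_swiotlb - Check whether swiotlb bounce buffering is required
 * @dma_bits: Number of DMA address bits the device can generate.
 *
 * Returns true for Xen paravirtual domains, when memory encryption is
 * active, or when some I/O memory resource ends above the device's DMA
 * addressing limit.
 */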
bool drm_need_swiotlb(int dma_bits)
{
	struct resource *tmp;
	resource_size_t max_iomem = 0;

	/*
	 * Xen paravirtual hosts require swiotlb regardless of requested dma
	 * transfer size.
	 *
	 * NOTE: Really, what it requires is use of the dma_alloc_coherent
	 * allocator used in ttm_dma_populate() instead of
	 * ttm_populate_and_map_pages(), which bounce buffers so much in
	 * Xen it leads to swiotlb buffer exhaustion.
	 */
	if (xen_pv_domain())
		return true;

	/*
	 * Enforce dma_alloc_coherent when memory encryption is active as well
	 * for the same reasons as for Xen paravirtual hosts.
	 */
	if (cc_platform_has(CC_ATTR_MEM_ENCRYPT))
		return true;

	for (tmp = iomem_resource.child; tmp; tmp = tmp->sibling)
		max_iomem = max(max_iomem, tmp->end);

	return max_iomem > ((u64)1 << dma_bits);
}
EXPORT_SYMBOL(drm_need_swiotlb);

static void memcpy_fallback(struct iosys_map *dst,
			    const struct iosys_map *src,
			    unsigned long len)
{
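	/* Pick the copy routine based on which sides are I/O memory. */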
	if (!dst->is_iomem && !src->is_iomem) {
		memcpy(dst->vaddr, src->vaddr, len);
	} else if (!src->is_iomem) {
		iosys_map_memcpy_to(dst, 0, src->vaddr, len);
	} else if (!dst->is_iomem) {
		memcpy_fromio(dst->vaddr, src->vaddr_iomem, len);
	} else {
		/*
		 * Bounce size is not performance tuned, but using a
		 * bounce buffer like this is significantly faster than
		 * resorting to ioreadxx() + iowritexx().
		 */
		char bounce[MEMCPY_BOUNCE_SIZE];
		void __iomem *_src = src->vaddr_iomem;
		void __iomem *_dst = dst->vaddr_iomem;

		while (len >= MEMCPY_BOUNCE_SIZE) {
			memcpy_fromio(bounce, _src, MEMCPY_BOUNCE_SIZE);
			memcpy_toio(_dst, bounce, MEMCPY_BOUNCE_SIZE);
			_src += MEMCPY_BOUNCE_SIZE;
			_dst += MEMCPY_BOUNCE_SIZE;
			len -= MEMCPY_BOUNCE_SIZE;
		}
		if (len) {
			/* Copy only the remaining tail, not a full bounce chunk. */
			memcpy_fromio(bounce, _src, len);
			memcpy_toio(_dst, bounce, len);
		}
	}
}

#ifdef CONFIG_X86

static DEFINE_STATIC_KEY_FALSE(has_movntdqa);

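/*
 * Streaming copy using MOVNTDQA loads. Note that @len is the number of
 * 16-byte chunks to copy, not bytes; the caller passes len >> 4. The first
 * loop moves four chunks (64 bytes) per iteration, the second handles the
 * remaining chunks one at a time.
 */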
static void __memcpy_ntdqa(void *dst, const void *src, unsigned long len)
{
	kernel_fpu_begin();

	while (len >= 4) {
		asm("movntdqa (%0), %%xmm0\n"
		    "movntdqa 16(%0), %%xmm1\n"
		    "movntdqa 32(%0), %%xmm2\n"
		    "movntdqa 48(%0), %%xmm3\n"
		    "movaps %%xmm0, (%1)\n"
		    "movaps %%xmm1, 16(%1)\n"
		    "movaps %%xmm2, 32(%1)\n"
		    "movaps %%xmm3, 48(%1)\n"
		    :: "r" (src), "r" (dst) : "memory");
		src += 64;
		dst += 64;
		len -= 4;
	}
	while (len--) {
		asm("movntdqa (%0), %%xmm0\n"
		    "movaps %%xmm0, (%1)\n"
		    :: "r" (src), "r" (dst) : "memory");
		src += 16;
		dst += 16;
	}

	kernel_fpu_end();
}

/*
 * __drm_memcpy_from_wc copies @len bytes from @src to @dst using
 * non-temporal instructions where available. If @dst, @src or @len is not
 * 16-byte aligned, the copy falls back to a plain memcpy().
 */
static void __drm_memcpy_from_wc(void *dst, const void *src, unsigned long len)
{
	if (unlikely(((unsigned long)dst | (unsigned long)src | len) & 15))
		memcpy(dst, src, len);
	else if (likely(len))
		__memcpy_ntdqa(dst, src, len >> 4);
}

/**
 * drm_memcpy_from_wc - Perform the fastest available memcpy from a source
 * that may be WC.
 * @dst: The destination pointer
 * @src: The source pointer
 * @len: The size of the area to transfer in bytes
 *
 * Tries an arch-optimized memcpy that prefetches when reading out of a WC
 * region, and if no such beast is available, falls back to a normal memcpy.
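 *
 * A minimal usage sketch (hypothetical pointers; the maps are set up with
 * iosys_map_set_vaddr_iomem() and iosys_map_set_vaddr()):
 *
 *   struct iosys_map src, dst;
 *
 *   iosys_map_set_vaddr_iomem(&src, vram_ptr);
 *   iosys_map_set_vaddr(&dst, shadow_buf);
 *   drm_memcpy_from_wc(&dst, &src, size);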
 */
void drm_memcpy_from_wc(struct iosys_map *dst,
			const struct iosys_map *src,
			unsigned long len)
{
	if (WARN_ON(in_interrupt())) {
		memcpy_fallback(dst, src, len);
		return;
	}

	if (static_branch_likely(&has_movntdqa)) {
		__drm_memcpy_from_wc(dst->is_iomem ?
				     (void __force *)dst->vaddr_iomem :
				     dst->vaddr,
				     src->is_iomem ?
				     (void const __force *)src->vaddr_iomem :
				     src->vaddr,
				     len);
		return;
	}

	memcpy_fallback(dst, src, len);
}
EXPORT_SYMBOL(drm_memcpy_from_wc);

/*
 * drm_memcpy_init_early - One time initialization of the WC memcpy code
 */
void drm_memcpy_init_early(void)
{
	/*
	 * Some hypervisors (e.g. KVM) don't support emulation of VEX-prefixed
	 * instructions, so don't enable movntdqa when running as a guest.
	 */
	if (static_cpu_has(X86_FEATURE_XMM4_1) &&
	    !boot_cpu_has(X86_FEATURE_HYPERVISOR))
		static_branch_enable(&has_movntdqa);
}
#else
void drm_memcpy_from_wc(struct iosys_map *dst,
			const struct iosys_map *src,
			unsigned long len)
{
	WARN_ON(in_interrupt());

	memcpy_fallback(dst, src, len);
}
EXPORT_SYMBOL(drm_memcpy_from_wc);

void drm_memcpy_init_early(void)
{
}
#endif /* CONFIG_X86 */