1 | // SPDX-License-Identifier: MIT |
2 | /* |
3 | * Copyright © 2008-2015 Intel Corporation |
4 | */ |
5 | |
6 | #include <linux/highmem.h> |
7 | |
8 | #include "display/intel_display.h" |
9 | #include "i915_drv.h" |
#include "i915_pvinfo.h"
#include "i915_reg.h"
#include "i915_scatterlist.h"
#include "i915_vgpu.h"
14 | #include "intel_gt_regs.h" |
15 | #include "intel_mchbar_regs.h" |
16 | |
17 | /** |
18 | * DOC: fence register handling |
19 | * |
 * Important to avoid confusion: "fences" in the i915 driver are not execution
21 | * fences used to track command completion but hardware detiler objects which |
22 | * wrap a given range of the global GTT. Each platform has only a fairly limited |
23 | * set of these objects. |
24 | * |
25 | * Fences are used to detile GTT memory mappings. They're also connected to the |
26 | * hardware frontbuffer render tracking and hence interact with frontbuffer |
27 | * compression. Furthermore on older platforms fences are required for tiled |
28 | * objects used by the display engine. They can also be used by the render |
29 | * engine - they're required for blitter commands and are optional for render |
30 | * commands. But on gen4+ both display (with the exception of fbc) and rendering |
31 | * have their own tiling state bits and don't need fences. |
32 | * |
33 | * Also note that fences only support X and Y tiling and hence can't be used for |
34 | * the fancier new tiling formats like W, Ys and Yf. |
35 | * |
36 | * Finally note that because fences are such a restricted resource they're |
37 | * dynamically associated with objects. Furthermore fence state is committed to |
38 | * the hardware lazily to avoid unnecessary stalls on gen2/3. Therefore code must |
39 | * explicitly call i915_gem_object_get_fence() to synchronize fencing status |
40 | * for cpu access. Also note that some code wants an unfenced view, for those |
41 | * cases the fence can be removed forcefully with i915_gem_object_put_fence(). |
42 | * |
43 | * Internally these functions will synchronize with userspace access by removing |
44 | * CPU ptes into GTT mmaps (not the GTT ptes themselves) as needed. |
45 | */ |
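
/*
 * A minimal sketch (not a verbatim call site) of how a fence is expected to
 * be used for CPU access through the mappable aperture, built from the
 * helpers defined in this file; error handling and object/vma lifetime
 * management are elided:
 *
 *	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
 *	err = i915_vma_pin_fence(vma);
 *	if (!err) {
 *		... access the object through the fenced GTT mmap
 *		    (vma->fence may be NULL for untiled objects) ...
 *		i915_vma_unpin_fence(vma);
 *	}
 *	intel_runtime_pm_put(&i915->runtime_pm, wakeref);
 *
 * i915_vma_unpin_fence() and the runtime-pm helpers are defined outside this
 * file; they appear here only to show the expected pairing.
 */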
46 | |
47 | #define pipelined 0 |
48 | |
49 | static struct drm_i915_private *fence_to_i915(struct i915_fence_reg *fence) |
50 | { |
51 | return fence->ggtt->vm.i915; |
52 | } |
53 | |
54 | static struct intel_uncore *fence_to_uncore(struct i915_fence_reg *fence) |
55 | { |
56 | return fence->ggtt->vm.gt->uncore; |
57 | } |
58 | |
59 | static void i965_write_fence_reg(struct i915_fence_reg *fence) |
60 | { |
61 | i915_reg_t fence_reg_lo, fence_reg_hi; |
62 | int fence_pitch_shift; |
63 | u64 val; |
64 | |
65 | if (GRAPHICS_VER(fence_to_i915(fence)) >= 6) { |
66 | fence_reg_lo = FENCE_REG_GEN6_LO(fence->id); |
67 | fence_reg_hi = FENCE_REG_GEN6_HI(fence->id); |
68 | fence_pitch_shift = GEN6_FENCE_PITCH_SHIFT; |
69 | |
70 | } else { |
71 | fence_reg_lo = FENCE_REG_965_LO(fence->id); |
72 | fence_reg_hi = FENCE_REG_965_HI(fence->id); |
73 | fence_pitch_shift = I965_FENCE_PITCH_SHIFT; |
74 | } |
75 | |
76 | val = 0; |
77 | if (fence->tiling) { |
78 | unsigned int stride = fence->stride; |
79 | |
80 | GEM_BUG_ON(!IS_ALIGNED(stride, 128)); |
81 | |
82 | val = fence->start + fence->size - I965_FENCE_PAGE; |
83 | val <<= 32; |
84 | val |= fence->start; |
85 | val |= (u64)((stride / 128) - 1) << fence_pitch_shift; |
86 | if (fence->tiling == I915_TILING_Y) |
87 | val |= BIT(I965_FENCE_TILING_Y_SHIFT); |
88 | val |= I965_FENCE_REG_VALID; |
89 | } |
90 | |
91 | if (!pipelined) { |
92 | struct intel_uncore *uncore = fence_to_uncore(fence); |
93 | |
94 | /* |
95 | * To w/a incoherency with non-atomic 64-bit register updates, |
96 | * we split the 64-bit update into two 32-bit writes. In order |
97 | * for a partial fence not to be evaluated between writes, we |
98 | * precede the update with write to turn off the fence register, |
99 | * and only enable the fence as the last step. |
100 | * |
101 | * For extra levels of paranoia, we make sure each step lands |
102 | * before applying the next step. |
103 | */ |
104 | intel_uncore_write_fw(uncore, fence_reg_lo, 0); |
105 | intel_uncore_posting_read_fw(uncore, fence_reg_lo); |
106 | |
107 | intel_uncore_write_fw(uncore, fence_reg_hi, upper_32_bits(val)); |
108 | intel_uncore_write_fw(uncore, fence_reg_lo, lower_32_bits(val)); |
109 | intel_uncore_posting_read_fw(uncore, fence_reg_lo); |
110 | } |
111 | } |
112 | |
113 | static void i915_write_fence_reg(struct i915_fence_reg *fence) |
114 | { |
115 | u32 val; |
116 | |
117 | val = 0; |
118 | if (fence->tiling) { |
119 | unsigned int stride = fence->stride; |
120 | unsigned int tiling = fence->tiling; |
121 | bool is_y_tiled = tiling == I915_TILING_Y; |
122 | |
123 | if (is_y_tiled && HAS_128_BYTE_Y_TILING(fence_to_i915(fence))) |
124 | stride /= 128; |
125 | else |
126 | stride /= 512; |
127 | GEM_BUG_ON(!is_power_of_2(stride)); |
128 | |
129 | val = fence->start; |
130 | if (is_y_tiled) |
131 | val |= BIT(I830_FENCE_TILING_Y_SHIFT); |
132 | val |= I915_FENCE_SIZE_BITS(fence->size); |
133 | val |= ilog2(stride) << I830_FENCE_PITCH_SHIFT; |
134 | |
135 | val |= I830_FENCE_REG_VALID; |
136 | } |
137 | |
138 | if (!pipelined) { |
139 | struct intel_uncore *uncore = fence_to_uncore(fence); |
140 | i915_reg_t reg = FENCE_REG(fence->id); |
141 | |
142 | intel_uncore_write_fw(uncore, reg, val); |
143 | intel_uncore_posting_read_fw(uncore, reg); |
144 | } |
145 | } |
146 | |
147 | static void i830_write_fence_reg(struct i915_fence_reg *fence) |
148 | { |
149 | u32 val; |
150 | |
151 | val = 0; |
152 | if (fence->tiling) { |
153 | unsigned int stride = fence->stride; |
154 | |
155 | val = fence->start; |
156 | if (fence->tiling == I915_TILING_Y) |
157 | val |= BIT(I830_FENCE_TILING_Y_SHIFT); |
158 | val |= I830_FENCE_SIZE_BITS(fence->size); |
159 | val |= ilog2(stride / 128) << I830_FENCE_PITCH_SHIFT; |
160 | val |= I830_FENCE_REG_VALID; |
161 | } |
162 | |
163 | if (!pipelined) { |
164 | struct intel_uncore *uncore = fence_to_uncore(fence); |
165 | i915_reg_t reg = FENCE_REG(fence->id); |
166 | |
167 | intel_uncore_write_fw(uncore, reg, val); |
168 | intel_uncore_posting_read_fw(uncore, reg); |
169 | } |
170 | } |
171 | |
172 | static void fence_write(struct i915_fence_reg *fence) |
173 | { |
174 | struct drm_i915_private *i915 = fence_to_i915(fence); |
175 | |
176 | /* |
177 | * Previous access through the fence register is marshalled by |
178 | * the mb() inside the fault handlers (i915_gem_release_mmaps) |
179 | * and explicitly managed for internal users. |
180 | */ |
181 | |
182 | if (GRAPHICS_VER(i915) == 2) |
183 | i830_write_fence_reg(fence); |
184 | else if (GRAPHICS_VER(i915) == 3) |
185 | i915_write_fence_reg(fence); |
186 | else |
187 | i965_write_fence_reg(fence); |
188 | |
189 | /* |
190 | * Access through the fenced region afterwards is |
191 | * ordered by the posting reads whilst writing the registers. |
192 | */ |
193 | } |
194 | |
195 | static bool gpu_uses_fence_registers(struct i915_fence_reg *fence) |
196 | { |
197 | return GRAPHICS_VER(fence_to_i915(fence)) < 4; |
198 | } |
199 | |
200 | static int fence_update(struct i915_fence_reg *fence, |
201 | struct i915_vma *vma) |
202 | { |
203 | struct i915_ggtt *ggtt = fence->ggtt; |
204 | struct intel_uncore *uncore = fence_to_uncore(fence); |
205 | intel_wakeref_t wakeref; |
206 | struct i915_vma *old; |
207 | int ret; |
208 | |
209 | fence->tiling = 0; |
210 | if (vma) { |
211 | GEM_BUG_ON(!i915_gem_object_get_stride(vma->obj) || |
212 | !i915_gem_object_get_tiling(vma->obj)); |
213 | |
214 | if (!i915_vma_is_map_and_fenceable(vma)) |
215 | return -EINVAL; |
216 | |
217 | if (gpu_uses_fence_registers(fence)) { |
218 | /* implicit 'unfenced' GPU blits */ |
219 | ret = i915_vma_sync(vma); |
220 | if (ret) |
221 | return ret; |
222 | } |
223 | |
224 | GEM_BUG_ON(vma->fence_size > i915_vma_size(vma)); |
225 | fence->start = i915_ggtt_offset(vma); |
226 | fence->size = vma->fence_size; |
		fence->stride = i915_gem_object_get_stride(vma->obj);
		fence->tiling = i915_gem_object_get_tiling(vma->obj);
229 | } |
230 | WRITE_ONCE(fence->dirty, false); |
231 | |
232 | old = xchg(&fence->vma, NULL); |
233 | if (old) { |
234 | /* XXX Ideally we would move the waiting to outside the mutex */ |
		ret = i915_active_wait(&fence->active);
236 | if (ret) { |
237 | fence->vma = old; |
238 | return ret; |
239 | } |
240 | |
		i915_vma_flush_writes(old);
242 | |
243 | /* |
244 | * Ensure that all userspace CPU access is completed before |
245 | * stealing the fence. |
246 | */ |
247 | if (old != vma) { |
248 | GEM_BUG_ON(old->fence != fence); |
			i915_vma_revoke_mmap(old);
250 | old->fence = NULL; |
251 | } |
252 | |
		list_move(&fence->link, &ggtt->fence_list);
254 | } |
255 | |
256 | /* |
257 | * We only need to update the register itself if the device is awake. |
258 | * If the device is currently powered down, we will defer the write |
259 | * to the runtime resume, see intel_ggtt_restore_fences(). |
260 | * |
261 | * This only works for removing the fence register, on acquisition |
262 | * the caller must hold the rpm wakeref. The fence register must |
263 | * be cleared before we can use any other fences to ensure that |
264 | * the new fences do not overlap the elided clears, confusing HW. |
265 | */ |
	wakeref = intel_runtime_pm_get_if_in_use(uncore->rpm);
267 | if (!wakeref) { |
268 | GEM_BUG_ON(vma); |
269 | return 0; |
270 | } |
271 | |
272 | WRITE_ONCE(fence->vma, vma); |
273 | fence_write(fence); |
274 | |
275 | if (vma) { |
276 | vma->fence = fence; |
		list_move_tail(&fence->link, &ggtt->fence_list);
278 | } |
279 | |
	intel_runtime_pm_put(uncore->rpm, wakeref);
281 | return 0; |
282 | } |
283 | |
284 | /** |
285 | * i915_vma_revoke_fence - force-remove fence for a VMA |
286 | * @vma: vma to map linearly (not through a fence reg) |
287 | * |
288 | * This function force-removes any fence from the given object, which is useful |
289 | * if the kernel wants to do untiled GTT access. |
290 | */ |
291 | void i915_vma_revoke_fence(struct i915_vma *vma) |
292 | { |
293 | struct i915_fence_reg *fence = vma->fence; |
294 | intel_wakeref_t wakeref; |
295 | |
296 | lockdep_assert_held(&vma->vm->mutex); |
297 | if (!fence) |
298 | return; |
299 | |
300 | GEM_BUG_ON(fence->vma != vma); |
301 | GEM_BUG_ON(!i915_active_is_idle(&fence->active)); |
302 | GEM_BUG_ON(atomic_read(&fence->pin_count)); |
303 | |
304 | fence->tiling = 0; |
305 | WRITE_ONCE(fence->vma, NULL); |
306 | vma->fence = NULL; |
307 | |
308 | /* |
309 | * Skip the write to HW if and only if the device is currently |
310 | * suspended. |
311 | * |
312 | * If the driver does not currently hold a wakeref (if_in_use == 0), |
313 | * the device may currently be runtime suspended, or it may be woken |
314 | * up before the suspend takes place. If the device is not suspended |
315 | * (powered down) and we skip clearing the fence register, the HW is |
316 | * left in an undefined state where we may end up with multiple |
317 | * registers overlapping. |
318 | */ |
319 | with_intel_runtime_pm_if_active(fence_to_uncore(fence)->rpm, wakeref) |
320 | fence_write(fence); |
321 | } |
322 | |
323 | static bool fence_is_active(const struct i915_fence_reg *fence) |
324 | { |
	return fence->vma && i915_vma_is_active(fence->vma);
326 | } |
327 | |
328 | static struct i915_fence_reg *fence_find(struct i915_ggtt *ggtt) |
329 | { |
330 | struct i915_fence_reg *active = NULL; |
331 | struct i915_fence_reg *fence, *fn; |
332 | |
333 | list_for_each_entry_safe(fence, fn, &ggtt->fence_list, link) { |
334 | GEM_BUG_ON(fence->vma && fence->vma->fence != fence); |
335 | |
336 | if (fence == active) /* now seen this fence twice */ |
			active = ERR_PTR(-EAGAIN);

		/* Prefer idle fences so we do not have to wait on the GPU */
		if (active != ERR_PTR(-EAGAIN) && fence_is_active(fence)) {
341 | if (!active) |
342 | active = fence; |
343 | |
			list_move_tail(&fence->link, &ggtt->fence_list);
345 | continue; |
346 | } |
347 | |
		if (atomic_read(&fence->pin_count))
349 | continue; |
350 | |
351 | return fence; |
352 | } |
353 | |
354 | /* Wait for completion of pending flips which consume fences */ |
	if (intel_has_pending_fb_unpin(ggtt->vm.i915))
		return ERR_PTR(-EAGAIN);

	return ERR_PTR(-ENOBUFS);
359 | } |
360 | |
361 | int __i915_vma_pin_fence(struct i915_vma *vma) |
362 | { |
	struct i915_ggtt *ggtt = i915_vm_to_ggtt(vma->vm);
	struct i915_fence_reg *fence;
	struct i915_vma *set = i915_gem_object_is_tiled(vma->obj) ? vma : NULL;
366 | int err; |
367 | |
368 | lockdep_assert_held(&vma->vm->mutex); |
369 | |
370 | /* Just update our place in the LRU if our fence is getting reused. */ |
371 | if (vma->fence) { |
372 | fence = vma->fence; |
373 | GEM_BUG_ON(fence->vma != vma); |
		atomic_inc(&fence->pin_count);
		if (!fence->dirty) {
			list_move_tail(&fence->link, &ggtt->fence_list);
377 | return 0; |
378 | } |
379 | } else if (set) { |
380 | fence = fence_find(ggtt); |
		if (IS_ERR(fence))
			return PTR_ERR(fence);

		GEM_BUG_ON(atomic_read(&fence->pin_count));
		atomic_inc(&fence->pin_count);
386 | } else { |
387 | return 0; |
388 | } |
389 | |
	err = fence_update(fence, set);
391 | if (err) |
392 | goto out_unpin; |
393 | |
394 | GEM_BUG_ON(fence->vma != set); |
395 | GEM_BUG_ON(vma->fence != (set ? fence : NULL)); |
396 | |
397 | if (set) |
398 | return 0; |
399 | |
400 | out_unpin: |
	atomic_dec(&fence->pin_count);
402 | return err; |
403 | } |
404 | |
405 | /** |
406 | * i915_vma_pin_fence - set up fencing for a vma |
407 | * @vma: vma to map through a fence reg |
408 | * |
409 | * When mapping objects through the GTT, userspace wants to be able to write |
410 | * to them without having to worry about swizzling if the object is tiled. |
411 | * This function walks the fence regs looking for a free one for @obj, |
412 | * stealing one if it can't find any. |
413 | * |
414 | * It then sets up the reg based on the object's properties: address, pitch |
415 | * and tiling format. |
416 | * |
417 | * For an untiled surface, this removes any existing fence. |
418 | * |
419 | * Returns: |
420 | * |
421 | * 0 on success, negative error code on failure. |
422 | */ |
423 | int i915_vma_pin_fence(struct i915_vma *vma) |
424 | { |
425 | int err; |
426 | |
	if (!vma->fence && !i915_gem_object_is_tiled(vma->obj))
428 | return 0; |
429 | |
430 | /* |
431 | * Note that we revoke fences on runtime suspend. Therefore the user |
432 | * must keep the device awake whilst using the fence. |
433 | */ |
	assert_rpm_wakelock_held(vma->vm->gt->uncore->rpm);
435 | GEM_BUG_ON(!i915_vma_is_ggtt(vma)); |
436 | |
437 | err = mutex_lock_interruptible(&vma->vm->mutex); |
438 | if (err) |
439 | return err; |
440 | |
441 | err = __i915_vma_pin_fence(vma); |
	mutex_unlock(&vma->vm->mutex);
443 | |
444 | return err; |
445 | } |
446 | |
447 | /** |
448 | * i915_reserve_fence - Reserve a fence for vGPU |
449 | * @ggtt: Global GTT |
450 | * |
 * This function walks the fence regs looking for a free one and removes
 * it from the fence_list. It is used to reserve a fence for vGPU to use.
453 | */ |
454 | struct i915_fence_reg *i915_reserve_fence(struct i915_ggtt *ggtt) |
455 | { |
456 | struct i915_fence_reg *fence; |
457 | int count; |
458 | int ret; |
459 | |
460 | lockdep_assert_held(&ggtt->vm.mutex); |
461 | |
462 | /* Keep at least one fence available for the display engine. */ |
463 | count = 0; |
464 | list_for_each_entry(fence, &ggtt->fence_list, link) |
		count += !atomic_read(&fence->pin_count);
	if (count <= 1)
		return ERR_PTR(-ENOSPC);
468 | |
469 | fence = fence_find(ggtt); |
	if (IS_ERR(fence))
471 | return fence; |
472 | |
473 | if (fence->vma) { |
474 | /* Force-remove fence from VMA */ |
475 | ret = fence_update(fence, NULL); |
476 | if (ret) |
			return ERR_PTR(ret);
478 | } |
479 | |
	list_del(&fence->link);
481 | |
482 | return fence; |
483 | } |
484 | |
485 | /** |
486 | * i915_unreserve_fence - Reclaim a reserved fence |
487 | * @fence: the fence reg |
488 | * |
 * This function adds a reserved fence register from vGPU back to the fence_list.
490 | */ |
491 | void i915_unreserve_fence(struct i915_fence_reg *fence) |
492 | { |
493 | struct i915_ggtt *ggtt = fence->ggtt; |
494 | |
495 | lockdep_assert_held(&ggtt->vm.mutex); |
496 | |
	list_add(&fence->link, &ggtt->fence_list);
498 | } |
499 | |
500 | /** |
501 | * intel_ggtt_restore_fences - restore fence state |
502 | * @ggtt: Global GTT |
503 | * |
504 | * Restore the hw fence state to match the software tracking again, to be called |
505 | * after a gpu reset and on resume. Note that on runtime suspend we only cancel |
506 | * the fences, to be reacquired by the user later. |
507 | */ |
508 | void intel_ggtt_restore_fences(struct i915_ggtt *ggtt) |
509 | { |
510 | int i; |
511 | |
512 | for (i = 0; i < ggtt->num_fences; i++) |
		fence_write(&ggtt->fence_regs[i]);
514 | } |
515 | |
516 | /** |
517 | * DOC: tiling swizzling details |
518 | * |
519 | * The idea behind tiling is to increase cache hit rates by rearranging |
520 | * pixel data so that a group of pixel accesses are in the same cacheline. |
521 | * Performance improvement from doing this on the back/depth buffer are on |
522 | * the order of 30%. |
523 | * |
524 | * Intel architectures make this somewhat more complicated, though, by |
525 | * adjustments made to addressing of data when the memory is in interleaved |
526 | * mode (matched pairs of DIMMS) to improve memory bandwidth. |
527 | * For interleaved memory, the CPU sends every sequential 64 bytes |
528 | * to an alternate memory channel so it can get the bandwidth from both. |
529 | * |
530 | * The GPU also rearranges its accesses for increased bandwidth to interleaved |
531 | * memory, and it matches what the CPU does for non-tiled. However, when tiled |
532 | * it does it a little differently, since one walks addresses not just in the |
533 | * X direction but also Y. So, along with alternating channels when bit |
534 | * 6 of the address flips, it also alternates when other bits flip -- Bits 9 |
535 | * (every 512 bytes, an X tile scanline) and 10 (every two X tile scanlines) |
536 | * are common to both the 915 and 965-class hardware. |
537 | * |
538 | * The CPU also sometimes XORs in higher bits as well, to improve |
539 | * bandwidth doing strided access like we do so frequently in graphics. This |
540 | * is called "Channel XOR Randomization" in the MCH documentation. The result |
541 | * is that the CPU is XORing in either bit 11 or bit 17 to bit 6 of its address |
542 | * decode. |
543 | * |
544 | * All of this bit 6 XORing has an effect on our memory management, |
545 | * as we need to make sure that the 3d driver can correctly address object |
546 | * contents. |
547 | * |
548 | * If we don't have interleaved memory, all tiling is safe and no swizzling is |
549 | * required. |
550 | * |
551 | * When bit 17 is XORed in, we simply refuse to tile at all. Bit |
552 | * 17 is not just a page offset, so as we page an object out and back in, |
553 | * individual pages in it will have different bit 17 addresses, resulting in |
554 | * each 64 bytes being swapped with its neighbor! |
555 | * |
556 | * Otherwise, if interleaved, we have to tell the 3d driver what the address |
557 | * swizzling it needs to do is, since it's writing with the CPU to the pages |
558 | * (bit 6 and potentially bit 11 XORed in), and the GPU is reading from the |
559 | * pages (bit 6, 9, and 10 XORed in), resulting in a cumulative bit swizzling |
560 | * required by the CPU of XORing in bit 6, 9, 10, and potentially 11, in order |
561 | * to match what the GPU expects. |
562 | */ |
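
/*
 * As an illustrative sketch only (this helper is not part of the driver),
 * the CPU-side offset for the common I915_BIT_6_SWIZZLE_9_10 mode described
 * above can be derived by shifting bits 9 and 10 down to bit position 6 and
 * XORing them in:
 *
 *	static inline u32 swizzle_offset_9_10(u32 offset)
 *	{
 *		return offset ^ (((offset >> 3) ^ (offset >> 4)) & BIT(6));
 *	}
 *
 * The _9_10_11 and _9_10_17 variants fold in bit 11 or bit 17 as well, which
 * is why bit 17 swizzling cannot be resolved without knowing the physical
 * address of each page (see the bit 17 helpers further below).
 */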
563 | |
564 | /** |
565 | * detect_bit_6_swizzle - detect bit 6 swizzling pattern |
566 | * @ggtt: Global GGTT |
567 | * |
568 | * Detects bit 6 swizzling of address lookup between IGD access and CPU |
569 | * access through main memory. |
570 | */ |
571 | static void detect_bit_6_swizzle(struct i915_ggtt *ggtt) |
572 | { |
573 | struct intel_uncore *uncore = ggtt->vm.gt->uncore; |
574 | struct drm_i915_private *i915 = ggtt->vm.i915; |
575 | u32 swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN; |
576 | u32 swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN; |
577 | |
578 | if (GRAPHICS_VER(i915) >= 8 || IS_VALLEYVIEW(i915)) { |
579 | /* |
580 | * On BDW+, swizzling is not used. We leave the CPU memory |
581 | * controller in charge of optimizing memory accesses without |
582 | * the extra address manipulation GPU side. |
583 | * |
584 | * VLV and CHV don't have GPU swizzling. |
585 | */ |
586 | swizzle_x = I915_BIT_6_SWIZZLE_NONE; |
587 | swizzle_y = I915_BIT_6_SWIZZLE_NONE; |
588 | } else if (GRAPHICS_VER(i915) >= 6) { |
589 | if (i915->preserve_bios_swizzle) { |
590 | if (intel_uncore_read(uncore, DISP_ARB_CTL) & |
591 | DISP_TILE_SURFACE_SWIZZLING) { |
592 | swizzle_x = I915_BIT_6_SWIZZLE_9_10; |
593 | swizzle_y = I915_BIT_6_SWIZZLE_9; |
594 | } else { |
595 | swizzle_x = I915_BIT_6_SWIZZLE_NONE; |
596 | swizzle_y = I915_BIT_6_SWIZZLE_NONE; |
597 | } |
598 | } else { |
599 | u32 dimm_c0, dimm_c1; |
600 | |
601 | dimm_c0 = intel_uncore_read(uncore, MAD_DIMM_C0); |
602 | dimm_c1 = intel_uncore_read(uncore, MAD_DIMM_C1); |
603 | dimm_c0 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK; |
604 | dimm_c1 &= MAD_DIMM_A_SIZE_MASK | MAD_DIMM_B_SIZE_MASK; |
605 | /* |
606 | * Enable swizzling when the channels are populated |
607 | * with identically sized dimms. We don't need to check |
608 | * the 3rd channel because no cpu with gpu attached |
609 | * ships in that configuration. Also, swizzling only |
610 | * makes sense for 2 channels anyway. |
611 | */ |
612 | if (dimm_c0 == dimm_c1) { |
613 | swizzle_x = I915_BIT_6_SWIZZLE_9_10; |
614 | swizzle_y = I915_BIT_6_SWIZZLE_9; |
615 | } else { |
616 | swizzle_x = I915_BIT_6_SWIZZLE_NONE; |
617 | swizzle_y = I915_BIT_6_SWIZZLE_NONE; |
618 | } |
619 | } |
620 | } else if (GRAPHICS_VER(i915) == 5) { |
621 | /* |
		 * On Ironlake, whatever the DRAM config, the GPU always
		 * does the same swizzling setup.
624 | */ |
625 | swizzle_x = I915_BIT_6_SWIZZLE_9_10; |
626 | swizzle_y = I915_BIT_6_SWIZZLE_9; |
627 | } else if (GRAPHICS_VER(i915) == 2) { |
628 | /* |
629 | * As far as we know, the 865 doesn't have these bit 6 |
630 | * swizzling issues. |
631 | */ |
632 | swizzle_x = I915_BIT_6_SWIZZLE_NONE; |
633 | swizzle_y = I915_BIT_6_SWIZZLE_NONE; |
634 | } else if (IS_G45(i915) || IS_I965G(i915) || IS_G33(i915)) { |
635 | /* |
636 | * The 965, G33, and newer, have a very flexible memory |
637 | * configuration. It will enable dual-channel mode |
638 | * (interleaving) on as much memory as it can, and the GPU |
639 | * will additionally sometimes enable different bit 6 |
640 | * swizzling for tiled objects from the CPU. |
641 | * |
642 | * Here's what I found on the G965: |
643 | * slot fill memory size swizzling |
644 | * 0A 0B 1A 1B 1-ch 2-ch |
645 | * 512 0 0 0 512 0 O |
646 | * 512 0 512 0 16 1008 X |
647 | * 512 0 0 512 16 1008 X |
648 | * 0 512 0 512 16 1008 X |
649 | * 1024 1024 1024 0 2048 1024 O |
650 | * |
651 | * We could probably detect this based on either the DRB |
652 | * matching, which was the case for the swizzling required in |
653 | * the table above, or from the 1-ch value being less than |
654 | * the minimum size of a rank. |
655 | * |
656 | * Reports indicate that the swizzling actually |
657 | * varies depending upon page placement inside the |
658 | * channels, i.e. we see swizzled pages where the |
659 | * banks of memory are paired and unswizzled on the |
660 | * uneven portion, so leave that as unknown. |
661 | */ |
662 | if (intel_uncore_read16(uncore, C0DRB3_BW) == |
663 | intel_uncore_read16(uncore, C1DRB3_BW)) { |
664 | swizzle_x = I915_BIT_6_SWIZZLE_9_10; |
665 | swizzle_y = I915_BIT_6_SWIZZLE_9; |
666 | } |
667 | } else { |
668 | u32 dcc = intel_uncore_read(uncore, DCC); |
669 | |
670 | /* |
671 | * On 9xx chipsets, channel interleave by the CPU is |
672 | * determined by DCC. For single-channel, neither the CPU |
673 | * nor the GPU do swizzling. For dual channel interleaved, |
674 | * the GPU's interleave is bit 9 and 10 for X tiled, and bit |
675 | * 9 for Y tiled. The CPU's interleave is independent, and |
676 | * can be based on either bit 11 (haven't seen this yet) or |
677 | * bit 17 (common). |
678 | */ |
679 | switch (dcc & DCC_ADDRESSING_MODE_MASK) { |
680 | case DCC_ADDRESSING_MODE_SINGLE_CHANNEL: |
681 | case DCC_ADDRESSING_MODE_DUAL_CHANNEL_ASYMMETRIC: |
682 | swizzle_x = I915_BIT_6_SWIZZLE_NONE; |
683 | swizzle_y = I915_BIT_6_SWIZZLE_NONE; |
684 | break; |
685 | case DCC_ADDRESSING_MODE_DUAL_CHANNEL_INTERLEAVED: |
686 | if (dcc & DCC_CHANNEL_XOR_DISABLE) { |
687 | /* |
688 | * This is the base swizzling by the GPU for |
689 | * tiled buffers. |
690 | */ |
691 | swizzle_x = I915_BIT_6_SWIZZLE_9_10; |
692 | swizzle_y = I915_BIT_6_SWIZZLE_9; |
693 | } else if ((dcc & DCC_CHANNEL_XOR_BIT_17) == 0) { |
694 | /* Bit 11 swizzling by the CPU in addition. */ |
695 | swizzle_x = I915_BIT_6_SWIZZLE_9_10_11; |
696 | swizzle_y = I915_BIT_6_SWIZZLE_9_11; |
697 | } else { |
698 | /* Bit 17 swizzling by the CPU in addition. */ |
699 | swizzle_x = I915_BIT_6_SWIZZLE_9_10_17; |
700 | swizzle_y = I915_BIT_6_SWIZZLE_9_17; |
701 | } |
702 | break; |
703 | } |
704 | |
705 | /* check for L-shaped memory aka modified enhanced addressing */ |
706 | if (GRAPHICS_VER(i915) == 4 && |
707 | !(intel_uncore_read(uncore, DCC2) & DCC2_MODIFIED_ENHANCED_DISABLE)) { |
708 | swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN; |
709 | swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN; |
710 | } |
711 | |
712 | if (dcc == 0xffffffff) { |
		drm_err(&i915->drm,
			"Couldn't read from MCHBAR. Disabling tiling.\n");
715 | swizzle_x = I915_BIT_6_SWIZZLE_UNKNOWN; |
716 | swizzle_y = I915_BIT_6_SWIZZLE_UNKNOWN; |
717 | } |
718 | } |
719 | |
720 | if (swizzle_x == I915_BIT_6_SWIZZLE_UNKNOWN || |
721 | swizzle_y == I915_BIT_6_SWIZZLE_UNKNOWN) { |
722 | /* |
723 | * Userspace likes to explode if it sees unknown swizzling, |
724 | * so lie. We will finish the lie when reporting through |
725 | * the get-tiling-ioctl by reporting the physical swizzle |
726 | * mode as unknown instead. |
727 | * |
728 | * As we don't strictly know what the swizzling is, it may be |
729 | * bit17 dependent, and so we need to also prevent the pages |
730 | * from being moved. |
731 | */ |
732 | i915->gem_quirks |= GEM_QUIRK_PIN_SWIZZLED_PAGES; |
733 | swizzle_x = I915_BIT_6_SWIZZLE_NONE; |
734 | swizzle_y = I915_BIT_6_SWIZZLE_NONE; |
735 | } |
736 | |
737 | to_gt(i915)->ggtt->bit_6_swizzle_x = swizzle_x; |
738 | to_gt(i915)->ggtt->bit_6_swizzle_y = swizzle_y; |
739 | } |
740 | |
741 | /* |
742 | * Swap every 64 bytes of this page around, to account for it having a new |
743 | * bit 17 of its physical address and therefore being interpreted differently |
744 | * by the GPU. |
745 | */ |
746 | static void swizzle_page(struct page *page) |
747 | { |
748 | char temp[64]; |
749 | char *vaddr; |
750 | int i; |
751 | |
752 | vaddr = kmap(page); |
753 | |
754 | for (i = 0; i < PAGE_SIZE; i += 128) { |
755 | memcpy(temp, &vaddr[i], 64); |
756 | memcpy(&vaddr[i], &vaddr[i + 64], 64); |
757 | memcpy(&vaddr[i + 64], temp, 64); |
758 | } |
759 | |
760 | kunmap(page); |
761 | } |
762 | |
763 | /** |
764 | * i915_gem_object_do_bit_17_swizzle - fixup bit 17 swizzling |
765 | * @obj: i915 GEM buffer object |
766 | * @pages: the scattergather list of physical pages |
767 | * |
768 | * This function fixes up the swizzling in case any page frame number for this |
769 | * object has changed in bit 17 since that state has been saved with |
770 | * i915_gem_object_save_bit_17_swizzle(). |
771 | * |
772 | * This is called when pinning backing storage again, since the kernel is free |
773 | * to move unpinned backing storage around (either by directly moving pages or |
774 | * by swapping them out and back in again). |
775 | */ |
776 | void |
777 | i915_gem_object_do_bit_17_swizzle(struct drm_i915_gem_object *obj, |
778 | struct sg_table *pages) |
779 | { |
780 | struct sgt_iter sgt_iter; |
781 | struct page *page; |
782 | int i; |
783 | |
784 | if (obj->bit_17 == NULL) |
785 | return; |
786 | |
787 | i = 0; |
788 | for_each_sgt_page(page, sgt_iter, pages) { |
789 | char new_bit_17 = page_to_phys(page) >> 17; |
790 | |
791 | if ((new_bit_17 & 0x1) != (test_bit(i, obj->bit_17) != 0)) { |
792 | swizzle_page(page); |
793 | set_page_dirty(page); |
794 | } |
795 | |
796 | i++; |
797 | } |
798 | } |
799 | |
800 | /** |
801 | * i915_gem_object_save_bit_17_swizzle - save bit 17 swizzling |
802 | * @obj: i915 GEM buffer object |
803 | * @pages: the scattergather list of physical pages |
804 | * |
805 | * This function saves the bit 17 of each page frame number so that swizzling |
806 | * can be fixed up later on with i915_gem_object_do_bit_17_swizzle(). This must |
807 | * be called before the backing storage can be unpinned. |
808 | */ |
809 | void |
810 | i915_gem_object_save_bit_17_swizzle(struct drm_i915_gem_object *obj, |
811 | struct sg_table *pages) |
812 | { |
813 | const unsigned int page_count = obj->base.size >> PAGE_SHIFT; |
814 | struct sgt_iter sgt_iter; |
815 | struct page *page; |
816 | int i; |
817 | |
818 | if (obj->bit_17 == NULL) { |
		obj->bit_17 = bitmap_zalloc(page_count, GFP_KERNEL);
		if (obj->bit_17 == NULL) {
			drm_err(obj->base.dev,
				"Failed to allocate memory for bit 17 record\n");
823 | return; |
824 | } |
825 | } |
826 | |
827 | i = 0; |
828 | |
829 | for_each_sgt_page(page, sgt_iter, pages) { |
830 | if (page_to_phys(page) & (1 << 17)) |
831 | __set_bit(i, obj->bit_17); |
832 | else |
833 | __clear_bit(i, obj->bit_17); |
834 | i++; |
835 | } |
836 | } |
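
/*
 * Rough sketch of how the two bit 17 helpers above are expected to pair up
 * around a backing-store move (illustrative only; the real call sites live
 * in the shmem get/put pages paths):
 *
 *	i915_gem_object_save_bit_17_swizzle(obj, pages);
 *	... pages are unpinned and may be swapped out or relocated ...
 *	i915_gem_object_do_bit_17_swizzle(obj, pages);
 */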
837 | |
838 | void intel_ggtt_init_fences(struct i915_ggtt *ggtt) |
839 | { |
840 | struct drm_i915_private *i915 = ggtt->vm.i915; |
841 | struct intel_uncore *uncore = ggtt->vm.gt->uncore; |
842 | int num_fences; |
843 | int i; |
844 | |
	INIT_LIST_HEAD(&ggtt->fence_list);
	INIT_LIST_HEAD(&ggtt->userfault_list);
847 | |
848 | detect_bit_6_swizzle(ggtt); |
849 | |
850 | if (!i915_ggtt_has_aperture(ggtt)) |
851 | num_fences = 0; |
852 | else if (GRAPHICS_VER(i915) >= 7 && |
853 | !(IS_VALLEYVIEW(i915) || IS_CHERRYVIEW(i915))) |
854 | num_fences = 32; |
855 | else if (GRAPHICS_VER(i915) >= 4 || |
856 | IS_I945G(i915) || IS_I945GM(i915) || |
857 | IS_G33(i915) || IS_PINEVIEW(i915)) |
858 | num_fences = 16; |
859 | else |
860 | num_fences = 8; |
861 | |
862 | if (intel_vgpu_active(i915)) |
863 | num_fences = intel_uncore_read(uncore, |
864 | vgtif_reg(avail_rs.fence_num)); |
	ggtt->fence_regs = kcalloc(num_fences,
				   sizeof(*ggtt->fence_regs),
				   GFP_KERNEL);
868 | if (!ggtt->fence_regs) |
869 | num_fences = 0; |
870 | |
871 | /* Initialize fence registers to zero */ |
872 | for (i = 0; i < num_fences; i++) { |
873 | struct i915_fence_reg *fence = &ggtt->fence_regs[i]; |
874 | |
875 | i915_active_init(&fence->active, NULL, NULL, 0); |
876 | fence->ggtt = ggtt; |
877 | fence->id = i; |
		list_add_tail(&fence->link, &ggtt->fence_list);
879 | } |
880 | ggtt->num_fences = num_fences; |
881 | |
882 | intel_ggtt_restore_fences(ggtt); |
883 | } |
884 | |
885 | void intel_ggtt_fini_fences(struct i915_ggtt *ggtt) |
886 | { |
887 | int i; |
888 | |
889 | for (i = 0; i < ggtt->num_fences; i++) { |
890 | struct i915_fence_reg *fence = &ggtt->fence_regs[i]; |
891 | |
		i915_active_fini(&fence->active);
893 | } |
894 | |
	kfree(ggtt->fence_regs);
896 | } |
897 | |
898 | void intel_gt_init_swizzling(struct intel_gt *gt) |
899 | { |
900 | struct drm_i915_private *i915 = gt->i915; |
901 | struct intel_uncore *uncore = gt->uncore; |
902 | |
903 | if (GRAPHICS_VER(i915) < 5 || |
904 | to_gt(i915)->ggtt->bit_6_swizzle_x == I915_BIT_6_SWIZZLE_NONE) |
905 | return; |
906 | |
	intel_uncore_rmw(uncore, DISP_ARB_CTL, 0, DISP_TILE_SURFACE_SWIZZLING);
908 | |
909 | if (GRAPHICS_VER(i915) == 5) |
910 | return; |
911 | |
	intel_uncore_rmw(uncore, TILECTL, 0, TILECTL_SWZCTL);
913 | |
914 | if (GRAPHICS_VER(i915) == 6) |
915 | intel_uncore_write(uncore, |
916 | ARB_MODE, |
917 | _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_SNB)); |
918 | else if (GRAPHICS_VER(i915) == 7) |
919 | intel_uncore_write(uncore, |
920 | ARB_MODE, |
921 | _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_IVB)); |
922 | else if (GRAPHICS_VER(i915) == 8) |
923 | intel_uncore_write(uncore, |
924 | GAMTARBMODE, |
925 | _MASKED_BIT_ENABLE(ARB_MODE_SWIZZLE_BDW)); |
926 | else |
927 | MISSING_CASE(GRAPHICS_VER(i915)); |
928 | } |
929 | |