1 | // SPDX-License-Identifier: GPL-2.0 |
2 | |
3 | #include <linux/version.h> |
4 | #include <linux/ptrace.h> |
5 | #include <uapi/linux/bpf.h> |
6 | #include <bpf/bpf_helpers.h> |
7 | |
/*
 * The CPU number, cstate number and pstate number are based
 * on 96boards Hikey with octa CA53 CPUs.
 *
 * Every CPU has three idle states for cstate:
 *   WFI, CPU_OFF, CLUSTER_OFF
 *
 * Every CPU has 5 operating points:
 *   208MHz, 432MHz, 729MHz, 960MHz, 1200MHz
 *
 * This code is based on these assumptions; other platforms
 * need to adjust these definitions.
 */
/* Number of CPUs the maps are sized for (multiplier in every max_entries) */
#define MAX_CPU 8
/* Per-CPU slots in pstate_duration; also the failure value of find_cpu_pstate_idx() */
#define MAX_PSTATE_ENTRIES 5
/* Per-CPU slots in cstate_duration */
#define MAX_CSTATE_ENTRIES 3
24 | |
/*
 * Operating-point table searched by find_cpu_pstate_idx(); the position of
 * a frequency in this array is its pstate index.
 * NOTE(review): values are presumably in kHz (208000 <-> 208MHz) - confirm
 * against the unit reported by the cpu_frequency tracepoint.
 */
static int cpu_opps[] = { 208000, 432000, 729000, 960000, 1200000 };
26 | |
/*
 * my_map structure is used to record cstate and pstate index and
 * timestamp (Idx, Ts); when a new event arrives we need to update
 * the combination with the new state index and timestamp (Idx`, Ts`).
 *
 * Based on (Idx, Ts) and (Idx`, Ts`) we can calculate the time
 * interval for the previous state: Duration(Idx) = Ts` - Ts.
 *
 * Every CPU has one array as below for recording state index and
 * timestamp, with cstate and pstate recorded separately:
 *
 * +--------------------------+
 * | cstate timestamp         |
 * +--------------------------+
 * | cstate index             |
 * +--------------------------+
 * | pstate timestamp         |
 * +--------------------------+
 * | pstate index             |
 * +--------------------------+
 */
/*
 * Slot offsets within one CPU's group of my_map entries; a key is built
 * as cpu * MAP_OFF_NUM + MAP_OFF_<slot>.
 */
#define MAP_OFF_CSTATE_TIME 0
#define MAP_OFF_CSTATE_IDX 1
#define MAP_OFF_PSTATE_TIME 2
#define MAP_OFF_PSTATE_IDX 3
/* Number of slots reserved per CPU */
#define MAP_OFF_NUM 4
53 | |
54 | struct { |
55 | __uint(type, BPF_MAP_TYPE_ARRAY); |
56 | __type(key, u32); |
57 | __type(value, u64); |
58 | __uint(max_entries, MAX_CPU * MAP_OFF_NUM); |
59 | } my_map SEC(".maps" ); |
60 | |
61 | /* cstate_duration records duration time for every idle state per CPU */ |
62 | struct { |
63 | __uint(type, BPF_MAP_TYPE_ARRAY); |
64 | __type(key, u32); |
65 | __type(value, u64); |
66 | __uint(max_entries, MAX_CPU * MAX_CSTATE_ENTRIES); |
67 | } cstate_duration SEC(".maps" ); |
68 | |
69 | /* pstate_duration records duration time for every operating point per CPU */ |
70 | struct { |
71 | __uint(type, BPF_MAP_TYPE_ARRAY); |
72 | __type(key, u32); |
73 | __type(value, u64); |
74 | __uint(max_entries, MAX_CPU * MAX_PSTATE_ENTRIES); |
75 | } pstate_duration SEC(".maps" ); |
76 | |
77 | /* |
78 | * The trace events for cpu_idle and cpu_frequency are taken from: |
79 | * /sys/kernel/tracing/events/power/cpu_idle/format |
80 | * /sys/kernel/tracing/events/power/cpu_frequency/format |
81 | * |
82 | * These two events have same format, so define one common structure. |
83 | */ |
/*
 * Context passed to both tracepoint programs in this file.
 * NOTE(review): the field layout is assumed to mirror the tracepoint format
 * files for cpu_idle and cpu_frequency - confirm before changing any member.
 */
struct cpu_args {
	u64 pad;	/* not read by the programs in this file */
	u32 state;	/* event state value; stored as the cstate or pstate */
	u32 cpu_id;	/* CPU the event refers to; used to build map keys */
};
89 | |
90 | /* calculate pstate index, returns MAX_PSTATE_ENTRIES for failure */ |
91 | static u32 find_cpu_pstate_idx(u32 frequency) |
92 | { |
93 | u32 i; |
94 | |
95 | for (i = 0; i < sizeof(cpu_opps) / sizeof(u32); i++) { |
96 | if (frequency == cpu_opps[i]) |
97 | return i; |
98 | } |
99 | |
100 | return i; |
101 | } |
102 | |
103 | SEC("tracepoint/power/cpu_idle" ) |
104 | int bpf_prog1(struct cpu_args *ctx) |
105 | { |
106 | u64 *cts, *pts, *cstate, *pstate, prev_state, cur_ts, delta; |
107 | u32 key, cpu, pstate_idx; |
108 | u64 *val; |
109 | |
110 | if (ctx->cpu_id > MAX_CPU) |
111 | return 0; |
112 | |
113 | cpu = ctx->cpu_id; |
114 | |
115 | key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_TIME; |
116 | cts = bpf_map_lookup_elem(&my_map, &key); |
117 | if (!cts) |
118 | return 0; |
119 | |
120 | key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; |
121 | cstate = bpf_map_lookup_elem(&my_map, &key); |
122 | if (!cstate) |
123 | return 0; |
124 | |
125 | key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; |
126 | pts = bpf_map_lookup_elem(&my_map, &key); |
127 | if (!pts) |
128 | return 0; |
129 | |
130 | key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; |
131 | pstate = bpf_map_lookup_elem(&my_map, &key); |
132 | if (!pstate) |
133 | return 0; |
134 | |
135 | prev_state = *cstate; |
136 | *cstate = ctx->state; |
137 | |
138 | if (!*cts) { |
139 | *cts = bpf_ktime_get_ns(); |
140 | return 0; |
141 | } |
142 | |
143 | cur_ts = bpf_ktime_get_ns(); |
144 | delta = cur_ts - *cts; |
145 | *cts = cur_ts; |
146 | |
147 | /* |
148 | * When state doesn't equal to (u32)-1, the cpu will enter |
149 | * one idle state; for this case we need to record interval |
150 | * for the pstate. |
151 | * |
152 | * OPP2 |
153 | * +---------------------+ |
154 | * OPP1 | | |
155 | * ---------+ | |
156 | * | Idle state |
157 | * +--------------- |
158 | * |
159 | * |<- pstate duration ->| |
160 | * ^ ^ |
161 | * pts cur_ts |
162 | */ |
163 | if (ctx->state != (u32)-1) { |
164 | |
165 | /* record pstate after have first cpu_frequency event */ |
166 | if (!*pts) |
167 | return 0; |
168 | |
169 | delta = cur_ts - *pts; |
170 | |
171 | pstate_idx = find_cpu_pstate_idx(*pstate); |
172 | if (pstate_idx >= MAX_PSTATE_ENTRIES) |
173 | return 0; |
174 | |
175 | key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; |
176 | val = bpf_map_lookup_elem(&pstate_duration, &key); |
177 | if (val) |
178 | __sync_fetch_and_add((long *)val, delta); |
179 | |
180 | /* |
181 | * When state equal to (u32)-1, the cpu just exits from one |
182 | * specific idle state; for this case we need to record |
183 | * interval for the pstate. |
184 | * |
185 | * OPP2 |
186 | * -----------+ |
187 | * | OPP1 |
188 | * | +----------- |
189 | * | Idle state | |
190 | * +---------------------+ |
191 | * |
192 | * |<- cstate duration ->| |
193 | * ^ ^ |
194 | * cts cur_ts |
195 | */ |
196 | } else { |
197 | |
198 | key = cpu * MAX_CSTATE_ENTRIES + prev_state; |
199 | val = bpf_map_lookup_elem(&cstate_duration, &key); |
200 | if (val) |
201 | __sync_fetch_and_add((long *)val, delta); |
202 | } |
203 | |
204 | /* Update timestamp for pstate as new start time */ |
205 | if (*pts) |
206 | *pts = cur_ts; |
207 | |
208 | return 0; |
209 | } |
210 | |
211 | SEC("tracepoint/power/cpu_frequency" ) |
212 | int bpf_prog2(struct cpu_args *ctx) |
213 | { |
214 | u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta; |
215 | u32 key, cpu, pstate_idx; |
216 | u64 *val; |
217 | |
218 | cpu = ctx->cpu_id; |
219 | |
220 | key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_TIME; |
221 | pts = bpf_map_lookup_elem(&my_map, &key); |
222 | if (!pts) |
223 | return 0; |
224 | |
225 | key = cpu * MAP_OFF_NUM + MAP_OFF_PSTATE_IDX; |
226 | pstate = bpf_map_lookup_elem(&my_map, &key); |
227 | if (!pstate) |
228 | return 0; |
229 | |
230 | key = cpu * MAP_OFF_NUM + MAP_OFF_CSTATE_IDX; |
231 | cstate = bpf_map_lookup_elem(&my_map, &key); |
232 | if (!cstate) |
233 | return 0; |
234 | |
235 | prev_state = *pstate; |
236 | *pstate = ctx->state; |
237 | |
238 | if (!*pts) { |
239 | *pts = bpf_ktime_get_ns(); |
240 | return 0; |
241 | } |
242 | |
243 | cur_ts = bpf_ktime_get_ns(); |
244 | delta = cur_ts - *pts; |
245 | *pts = cur_ts; |
246 | |
247 | /* When CPU is in idle, bail out to skip pstate statistics */ |
248 | if (*cstate != (u32)(-1)) |
249 | return 0; |
250 | |
251 | /* |
252 | * The cpu changes to another different OPP (in below diagram |
253 | * change frequency from OPP3 to OPP1), need recording interval |
254 | * for previous frequency OPP3 and update timestamp as start |
255 | * time for new frequency OPP1. |
256 | * |
257 | * OPP3 |
258 | * +---------------------+ |
259 | * OPP2 | | |
260 | * ---------+ | |
261 | * | OPP1 |
262 | * +--------------- |
263 | * |
264 | * |<- pstate duration ->| |
265 | * ^ ^ |
266 | * pts cur_ts |
267 | */ |
268 | pstate_idx = find_cpu_pstate_idx(*pstate); |
269 | if (pstate_idx >= MAX_PSTATE_ENTRIES) |
270 | return 0; |
271 | |
272 | key = cpu * MAX_PSTATE_ENTRIES + pstate_idx; |
273 | val = bpf_map_lookup_elem(&pstate_duration, &key); |
274 | if (val) |
275 | __sync_fetch_and_add((long *)val, delta); |
276 | |
277 | return 0; |
278 | } |
279 | |
/* License string emitted into the "license" section */
char _license[] SEC("license") = "GPL";
/* Kernel version code emitted into the "version" section */
u32 _version SEC("version") = LINUX_VERSION_CODE;
282 | |