// SPDX-License-Identifier: GPL-2.0
/*
 * NVMe over Fabrics TCP host.
 * Copyright (c) 2018 Lightbits Labs. All rights reserved.
 */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/module.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/err.h>
#include <linux/key.h>
#include <linux/nvme-tcp.h>
#include <linux/nvme-keyring.h>
#include <net/sock.h>
#include <net/tcp.h>
#include <net/tls.h>
#include <net/tls_prot.h>
#include <net/handshake.h>
#include <linux/blk-mq.h>
#include <crypto/hash.h>
#include <net/busy_poll.h>
#include <trace/events/sock.h>

#include "nvme.h"
#include "fabrics.h"

struct nvme_tcp_queue;

/*
 * Define the socket priority to use for connections where it is desirable
 * that the NIC consider performing optimized packet processing or filtering.
 * A non-zero value is sufficient to indicate general consideration of any
 * possible optimization. Making it a module param allows for alternative
 * values that may be unique for some NIC implementations.
 */
static int so_priority;
module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");

/*
 * Use an unbound workqueue for nvme_tcp_wq so that the CPU affinity of the
 * I/O work can be tuned from sysfs.
 */
static bool wq_unbound;
module_param(wq_unbound, bool, 0644);
MODULE_PARM_DESC(wq_unbound, "Use unbound workqueue for nvme-tcp IO context (default false)");
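/*
 * Illustrative usage (values are arbitrary); both parameters are 0644 and so
 * also writable through sysfs after the module is loaded:
 *
 *	modprobe nvme_tcp so_priority=1 wq_unbound=Y
 *	echo 6 > /sys/module/nvme_tcp/parameters/so_priority
 *
 * wq_unbound affects how io_work is placed when queues are created (see
 * nvme_tcp_set_queue_io_cpu()); with it set, the work runs on the unbound
 * nvme_tcp_wq, whose CPU placement can be restricted via the workqueue
 * cpumask attributes in sysfs.
 */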
46
47/*
48 * TLS handshake timeout
49 */
50static int tls_handshake_timeout = 10;
51#ifdef CONFIG_NVME_TCP_TLS
52module_param(tls_handshake_timeout, int, 0644);
53MODULE_PARM_DESC(tls_handshake_timeout,
54 "nvme TLS handshake timeout in seconds (default 10)");
55#endif
56
57#ifdef CONFIG_DEBUG_LOCK_ALLOC
58/* lockdep can detect a circular dependency of the form
59 * sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
60 * because dependencies are tracked for both nvme-tcp and user contexts. Using
61 * a separate class prevents lockdep from conflating nvme-tcp socket use with
62 * user-space socket API use.
63 */
64static struct lock_class_key nvme_tcp_sk_key[2];
65static struct lock_class_key nvme_tcp_slock_key[2];
66
67static void nvme_tcp_reclassify_socket(struct socket *sock)
68{
69 struct sock *sk = sock->sk;
70
71 if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
72 return;
73
74 switch (sk->sk_family) {
75 case AF_INET:
76 sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME",
77 &nvme_tcp_slock_key[0],
78 "sk_lock-AF_INET-NVME",
79 &nvme_tcp_sk_key[0]);
80 break;
81 case AF_INET6:
82 sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME",
83 &nvme_tcp_slock_key[1],
84 "sk_lock-AF_INET6-NVME",
85 &nvme_tcp_sk_key[1]);
86 break;
87 default:
88 WARN_ON_ONCE(1);
89 }
90}
91#else
92static void nvme_tcp_reclassify_socket(struct socket *sock) { }
93#endif
94
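/*
 * Per-request send state machine, driven by nvme_tcp_try_send(): every
 * command starts with its command capsule PDU (NVME_TCP_SEND_CMD_PDU).
 * Writes then either carry inline data or answer an R2T with an H2CData
 * header (NVME_TCP_SEND_H2C_PDU) followed by payload (NVME_TCP_SEND_DATA),
 * and finish with the data digest (NVME_TCP_SEND_DDGST) when data digests
 * are enabled.
 */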
95enum nvme_tcp_send_state {
96 NVME_TCP_SEND_CMD_PDU = 0,
97 NVME_TCP_SEND_H2C_PDU,
98 NVME_TCP_SEND_DATA,
99 NVME_TCP_SEND_DDGST,
100};
101
102struct nvme_tcp_request {
103 struct nvme_request req;
104 void *pdu;
105 struct nvme_tcp_queue *queue;
106 u32 data_len;
107 u32 pdu_len;
108 u32 pdu_sent;
109 u32 h2cdata_left;
110 u32 h2cdata_offset;
111 u16 ttag;
112 __le16 status;
113 struct list_head entry;
114 struct llist_node lentry;
115 __le32 ddgst;
116
117 struct bio *curr_bio;
118 struct iov_iter iter;
119
120 /* send state */
121 size_t offset;
122 size_t data_sent;
123 enum nvme_tcp_send_state state;
124};
125
126enum nvme_tcp_queue_flags {
127 NVME_TCP_Q_ALLOCATED = 0,
128 NVME_TCP_Q_LIVE = 1,
129 NVME_TCP_Q_POLLING = 2,
130};
131
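/*
 * Per-queue receive state machine, see nvme_tcp_recv_state(): the PDU
 * header is reassembled first (NVME_TCP_RECV_PDU), then any C2HData
 * payload (NVME_TCP_RECV_DATA), then the trailing data digest if one was
 * negotiated (NVME_TCP_RECV_DDGST), after which nvme_tcp_init_recv_ctx()
 * re-arms the queue for the next PDU.
 */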
132enum nvme_tcp_recv_state {
133 NVME_TCP_RECV_PDU = 0,
134 NVME_TCP_RECV_DATA,
135 NVME_TCP_RECV_DDGST,
136};
137
138struct nvme_tcp_ctrl;
139struct nvme_tcp_queue {
140 struct socket *sock;
141 struct work_struct io_work;
142 int io_cpu;
143
144 struct mutex queue_lock;
145 struct mutex send_mutex;
146 struct llist_head req_list;
147 struct list_head send_list;
148
149 /* recv state */
150 void *pdu;
151 int pdu_remaining;
152 int pdu_offset;
153 size_t data_remaining;
154 size_t ddgst_remaining;
155 unsigned int nr_cqe;
156
157 /* send state */
158 struct nvme_tcp_request *request;
159
160 u32 maxh2cdata;
161 size_t cmnd_capsule_len;
162 struct nvme_tcp_ctrl *ctrl;
163 unsigned long flags;
164 bool rd_enabled;
165
166 bool hdr_digest;
167 bool data_digest;
168 struct ahash_request *rcv_hash;
169 struct ahash_request *snd_hash;
170 __le32 exp_ddgst;
171 __le32 recv_ddgst;
172 struct completion tls_complete;
173 int tls_err;
174 struct page_frag_cache pf_cache;
175
176 void (*state_change)(struct sock *);
177 void (*data_ready)(struct sock *);
178 void (*write_space)(struct sock *);
179};
180
181struct nvme_tcp_ctrl {
182 /* read only in the hot path */
183 struct nvme_tcp_queue *queues;
184 struct blk_mq_tag_set tag_set;
185
186 /* other member variables */
187 struct list_head list;
188 struct blk_mq_tag_set admin_tag_set;
189 struct sockaddr_storage addr;
190 struct sockaddr_storage src_addr;
191 struct nvme_ctrl ctrl;
192
193 struct work_struct err_work;
194 struct delayed_work connect_work;
195 struct nvme_tcp_request async_req;
196 u32 io_queues[HCTX_MAX_TYPES];
197};
198
199static LIST_HEAD(nvme_tcp_ctrl_list);
200static DEFINE_MUTEX(nvme_tcp_ctrl_mutex);
201static struct workqueue_struct *nvme_tcp_wq;
202static const struct blk_mq_ops nvme_tcp_mq_ops;
203static const struct blk_mq_ops nvme_tcp_admin_mq_ops;
204static int nvme_tcp_try_send(struct nvme_tcp_queue *queue);
205
206static inline struct nvme_tcp_ctrl *to_tcp_ctrl(struct nvme_ctrl *ctrl)
207{
208 return container_of(ctrl, struct nvme_tcp_ctrl, ctrl);
209}
210
211static inline int nvme_tcp_queue_id(struct nvme_tcp_queue *queue)
212{
213 return queue - queue->ctrl->queues;
214}
215
static inline bool nvme_tcp_tls(struct nvme_ctrl *ctrl)
{
	if (!IS_ENABLED(CONFIG_NVME_TCP_TLS))
		return false;

	return ctrl->opts->tls;
}
223
224static inline struct blk_mq_tags *nvme_tcp_tagset(struct nvme_tcp_queue *queue)
225{
226 u32 queue_idx = nvme_tcp_queue_id(queue);
227
228 if (queue_idx == 0)
229 return queue->ctrl->admin_tag_set.tags[queue_idx];
230 return queue->ctrl->tag_set.tags[queue_idx - 1];
231}
232
233static inline u8 nvme_tcp_hdgst_len(struct nvme_tcp_queue *queue)
234{
235 return queue->hdr_digest ? NVME_TCP_DIGEST_LENGTH : 0;
236}
237
238static inline u8 nvme_tcp_ddgst_len(struct nvme_tcp_queue *queue)
239{
240 return queue->data_digest ? NVME_TCP_DIGEST_LENGTH : 0;
241}
242
243static inline void *nvme_tcp_req_cmd_pdu(struct nvme_tcp_request *req)
244{
245 return req->pdu;
246}
247
248static inline void *nvme_tcp_req_data_pdu(struct nvme_tcp_request *req)
249{
250 /* use the pdu space in the back for the data pdu */
251 return req->pdu + sizeof(struct nvme_tcp_cmd_pdu) -
252 sizeof(struct nvme_tcp_data_pdu);
253}
254
255static inline size_t nvme_tcp_inline_data_size(struct nvme_tcp_request *req)
256{
257 if (nvme_is_fabrics(cmd: req->req.cmd))
258 return NVME_TCP_ADMIN_CCSZ;
259 return req->queue->cmnd_capsule_len - sizeof(struct nvme_command);
260}
261
262static inline bool nvme_tcp_async_req(struct nvme_tcp_request *req)
263{
264 return req == &req->queue->ctrl->async_req;
265}
266
267static inline bool nvme_tcp_has_inline_data(struct nvme_tcp_request *req)
268{
269 struct request *rq;
270
271 if (unlikely(nvme_tcp_async_req(req)))
272 return false; /* async events don't have a request */
273
274 rq = blk_mq_rq_from_pdu(pdu: req);
275
276 return rq_data_dir(rq) == WRITE && req->data_len &&
277 req->data_len <= nvme_tcp_inline_data_size(req);
278}
279
280static inline struct page *nvme_tcp_req_cur_page(struct nvme_tcp_request *req)
281{
282 return req->iter.bvec->bv_page;
283}
284
285static inline size_t nvme_tcp_req_cur_offset(struct nvme_tcp_request *req)
286{
287 return req->iter.bvec->bv_offset + req->iter.iov_offset;
288}
289
290static inline size_t nvme_tcp_req_cur_length(struct nvme_tcp_request *req)
291{
292 return min_t(size_t, iov_iter_single_seg_count(&req->iter),
293 req->pdu_len - req->pdu_sent);
294}
295
296static inline size_t nvme_tcp_pdu_data_left(struct nvme_tcp_request *req)
297{
298 return rq_data_dir(blk_mq_rq_from_pdu(req)) == WRITE ?
299 req->pdu_len - req->pdu_sent : 0;
300}
301
302static inline size_t nvme_tcp_pdu_last_send(struct nvme_tcp_request *req,
303 int len)
304{
305 return nvme_tcp_pdu_data_left(req) <= len;
306}
307
308static void nvme_tcp_init_iter(struct nvme_tcp_request *req,
309 unsigned int dir)
310{
311 struct request *rq = blk_mq_rq_from_pdu(pdu: req);
312 struct bio_vec *vec;
313 unsigned int size;
314 int nr_bvec;
315 size_t offset;
316
317 if (rq->rq_flags & RQF_SPECIAL_PAYLOAD) {
318 vec = &rq->special_vec;
319 nr_bvec = 1;
320 size = blk_rq_payload_bytes(rq);
321 offset = 0;
322 } else {
323 struct bio *bio = req->curr_bio;
324 struct bvec_iter bi;
325 struct bio_vec bv;
326
327 vec = __bvec_iter_bvec(bio->bi_io_vec, bio->bi_iter);
328 nr_bvec = 0;
329 bio_for_each_bvec(bv, bio, bi) {
330 nr_bvec++;
331 }
332 size = bio->bi_iter.bi_size;
333 offset = bio->bi_iter.bi_bvec_done;
334 }
335
336 iov_iter_bvec(i: &req->iter, direction: dir, bvec: vec, nr_segs: nr_bvec, count: size);
337 req->iter.iov_offset = offset;
338}
339
340static inline void nvme_tcp_advance_req(struct nvme_tcp_request *req,
341 int len)
342{
343 req->data_sent += len;
344 req->pdu_sent += len;
345 iov_iter_advance(i: &req->iter, bytes: len);
346 if (!iov_iter_count(i: &req->iter) &&
347 req->data_sent < req->data_len) {
348 req->curr_bio = req->curr_bio->bi_next;
349 nvme_tcp_init_iter(req, ITER_SOURCE);
350 }
351}
352
353static inline void nvme_tcp_send_all(struct nvme_tcp_queue *queue)
354{
355 int ret;
356
357 /* drain the send queue as much as we can... */
358 do {
359 ret = nvme_tcp_try_send(queue);
360 } while (ret > 0);
361}
362
363static inline bool nvme_tcp_queue_more(struct nvme_tcp_queue *queue)
364{
365 return !list_empty(head: &queue->send_list) ||
366 !llist_empty(head: &queue->req_list);
367}
368
static inline void nvme_tcp_queue_request(struct nvme_tcp_request *req,
		bool sync, bool last)
{
	struct nvme_tcp_queue *queue = req->queue;
	bool empty;

	empty = llist_add(&req->lentry, &queue->req_list) &&
		list_empty(&queue->send_list) && !queue->request;

	/*
	 * If we're the first on the send_list, try to send directly;
	 * otherwise queue io_work. Also, only do that if we are on the
	 * same cpu, so we don't introduce contention.
	 */
	if (queue->io_cpu == raw_smp_processor_id() &&
	    sync && empty && mutex_trylock(&queue->send_mutex)) {
		nvme_tcp_send_all(queue);
		mutex_unlock(&queue->send_mutex);
	}

	if (last && nvme_tcp_queue_more(queue))
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
392
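/*
 * Submission contexts only ever add to the lock-free req_list; the send
 * path (running under send_mutex) splices those entries onto send_list
 * here, so a single context owns the ordered list.
 */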
393static void nvme_tcp_process_req_list(struct nvme_tcp_queue *queue)
394{
395 struct nvme_tcp_request *req;
396 struct llist_node *node;
397
398 for (node = llist_del_all(head: &queue->req_list); node; node = node->next) {
399 req = llist_entry(node, struct nvme_tcp_request, lentry);
400 list_add(new: &req->entry, head: &queue->send_list);
401 }
402}
403
404static inline struct nvme_tcp_request *
405nvme_tcp_fetch_request(struct nvme_tcp_queue *queue)
406{
407 struct nvme_tcp_request *req;
408
409 req = list_first_entry_or_null(&queue->send_list,
410 struct nvme_tcp_request, entry);
411 if (!req) {
412 nvme_tcp_process_req_list(queue);
413 req = list_first_entry_or_null(&queue->send_list,
414 struct nvme_tcp_request, entry);
415 if (unlikely(!req))
416 return NULL;
417 }
418
419 list_del(entry: &req->entry);
420 return req;
421}
422
423static inline void nvme_tcp_ddgst_final(struct ahash_request *hash,
424 __le32 *dgst)
425{
426 ahash_request_set_crypt(req: hash, NULL, result: (u8 *)dgst, nbytes: 0);
427 crypto_ahash_final(req: hash);
428}
429
430static inline void nvme_tcp_ddgst_update(struct ahash_request *hash,
431 struct page *page, off_t off, size_t len)
432{
433 struct scatterlist sg;
434
435 sg_init_table(&sg, 1);
436 sg_set_page(sg: &sg, page, len, offset: off);
437 ahash_request_set_crypt(req: hash, src: &sg, NULL, nbytes: len);
438 crypto_ahash_update(req: hash);
439}
440
441static inline void nvme_tcp_hdgst(struct ahash_request *hash,
442 void *pdu, size_t len)
443{
444 struct scatterlist sg;
445
446 sg_init_one(&sg, pdu, len);
447 ahash_request_set_crypt(req: hash, src: &sg, result: pdu + len, nbytes: len);
448 crypto_ahash_digest(req: hash);
449}
450
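/*
 * NVMe/TCP header and data digests are CRC32C checksums, negotiated
 * per-queue during ICReq/ICResp; both directions use the "crc32c" ahash
 * transform set up in nvme_tcp_alloc_crypto().
 */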
451static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue,
452 void *pdu, size_t pdu_len)
453{
454 struct nvme_tcp_hdr *hdr = pdu;
455 __le32 recv_digest;
456 __le32 exp_digest;
457
458 if (unlikely(!(hdr->flags & NVME_TCP_F_HDGST))) {
459 dev_err(queue->ctrl->ctrl.device,
460 "queue %d: header digest flag is cleared\n",
461 nvme_tcp_queue_id(queue));
462 return -EPROTO;
463 }
464
465 recv_digest = *(__le32 *)(pdu + hdr->hlen);
466 nvme_tcp_hdgst(hash: queue->rcv_hash, pdu, len: pdu_len);
467 exp_digest = *(__le32 *)(pdu + hdr->hlen);
468 if (recv_digest != exp_digest) {
469 dev_err(queue->ctrl->ctrl.device,
470 "header digest error: recv %#x expected %#x\n",
471 le32_to_cpu(recv_digest), le32_to_cpu(exp_digest));
472 return -EIO;
473 }
474
475 return 0;
476}
477
478static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu)
479{
480 struct nvme_tcp_hdr *hdr = pdu;
481 u8 digest_len = nvme_tcp_hdgst_len(queue);
482 u32 len;
483
484 len = le32_to_cpu(hdr->plen) - hdr->hlen -
485 ((hdr->flags & NVME_TCP_F_HDGST) ? digest_len : 0);
486
487 if (unlikely(len && !(hdr->flags & NVME_TCP_F_DDGST))) {
488 dev_err(queue->ctrl->ctrl.device,
489 "queue %d: data digest flag is cleared\n",
490 nvme_tcp_queue_id(queue));
491 return -EPROTO;
492 }
493 crypto_ahash_init(req: queue->rcv_hash);
494
495 return 0;
496}
497
498static void nvme_tcp_exit_request(struct blk_mq_tag_set *set,
499 struct request *rq, unsigned int hctx_idx)
500{
501 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
502
503 page_frag_free(addr: req->pdu);
504}
505
506static int nvme_tcp_init_request(struct blk_mq_tag_set *set,
507 struct request *rq, unsigned int hctx_idx,
508 unsigned int numa_node)
509{
510 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(ctrl: set->driver_data);
511 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
512 struct nvme_tcp_cmd_pdu *pdu;
513 int queue_idx = (set == &ctrl->tag_set) ? hctx_idx + 1 : 0;
514 struct nvme_tcp_queue *queue = &ctrl->queues[queue_idx];
515 u8 hdgst = nvme_tcp_hdgst_len(queue);
516
517 req->pdu = page_frag_alloc(nc: &queue->pf_cache,
518 fragsz: sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
519 GFP_KERNEL | __GFP_ZERO);
520 if (!req->pdu)
521 return -ENOMEM;
522
523 pdu = req->pdu;
524 req->queue = queue;
525 nvme_req(req: rq)->ctrl = &ctrl->ctrl;
526 nvme_req(req: rq)->cmd = &pdu->cmd;
527
528 return 0;
529}
530
531static int nvme_tcp_init_hctx(struct blk_mq_hw_ctx *hctx, void *data,
532 unsigned int hctx_idx)
533{
534 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(ctrl: data);
535 struct nvme_tcp_queue *queue = &ctrl->queues[hctx_idx + 1];
536
537 hctx->driver_data = queue;
538 return 0;
539}
540
541static int nvme_tcp_init_admin_hctx(struct blk_mq_hw_ctx *hctx, void *data,
542 unsigned int hctx_idx)
543{
544 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(ctrl: data);
545 struct nvme_tcp_queue *queue = &ctrl->queues[0];
546
547 hctx->driver_data = queue;
548 return 0;
549}
550
551static enum nvme_tcp_recv_state
552nvme_tcp_recv_state(struct nvme_tcp_queue *queue)
553{
554 return (queue->pdu_remaining) ? NVME_TCP_RECV_PDU :
555 (queue->ddgst_remaining) ? NVME_TCP_RECV_DDGST :
556 NVME_TCP_RECV_DATA;
557}
558
559static void nvme_tcp_init_recv_ctx(struct nvme_tcp_queue *queue)
560{
561 queue->pdu_remaining = sizeof(struct nvme_tcp_rsp_pdu) +
562 nvme_tcp_hdgst_len(queue);
563 queue->pdu_offset = 0;
564 queue->data_remaining = -1;
565 queue->ddgst_remaining = 0;
566}
567
568static void nvme_tcp_error_recovery(struct nvme_ctrl *ctrl)
569{
570 if (!nvme_change_ctrl_state(ctrl, new_state: NVME_CTRL_RESETTING))
571 return;
572
573 dev_warn(ctrl->device, "starting error recovery\n");
574 queue_work(wq: nvme_reset_wq, work: &to_tcp_ctrl(ctrl)->err_work);
575}
576
577static int nvme_tcp_process_nvme_cqe(struct nvme_tcp_queue *queue,
578 struct nvme_completion *cqe)
579{
580 struct nvme_tcp_request *req;
581 struct request *rq;
582
583 rq = nvme_find_rq(tags: nvme_tcp_tagset(queue), command_id: cqe->command_id);
584 if (!rq) {
585 dev_err(queue->ctrl->ctrl.device,
586 "got bad cqe.command_id %#x on queue %d\n",
587 cqe->command_id, nvme_tcp_queue_id(queue));
588 nvme_tcp_error_recovery(ctrl: &queue->ctrl->ctrl);
589 return -EINVAL;
590 }
591
592 req = blk_mq_rq_to_pdu(rq);
593 if (req->status == cpu_to_le16(NVME_SC_SUCCESS))
594 req->status = cqe->status;
595
596 if (!nvme_try_complete_req(req: rq, status: req->status, result: cqe->result))
597 nvme_complete_rq(req: rq);
598 queue->nr_cqe++;
599
600 return 0;
601}
602
603static int nvme_tcp_handle_c2h_data(struct nvme_tcp_queue *queue,
604 struct nvme_tcp_data_pdu *pdu)
605{
606 struct request *rq;
607
608 rq = nvme_find_rq(tags: nvme_tcp_tagset(queue), command_id: pdu->command_id);
609 if (!rq) {
610 dev_err(queue->ctrl->ctrl.device,
611 "got bad c2hdata.command_id %#x on queue %d\n",
612 pdu->command_id, nvme_tcp_queue_id(queue));
613 return -ENOENT;
614 }
615
616 if (!blk_rq_payload_bytes(rq)) {
617 dev_err(queue->ctrl->ctrl.device,
618 "queue %d tag %#x unexpected data\n",
619 nvme_tcp_queue_id(queue), rq->tag);
620 return -EIO;
621 }
622
623 queue->data_remaining = le32_to_cpu(pdu->data_length);
624
625 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS &&
626 unlikely(!(pdu->hdr.flags & NVME_TCP_F_DATA_LAST))) {
627 dev_err(queue->ctrl->ctrl.device,
628 "queue %d tag %#x SUCCESS set but not last PDU\n",
629 nvme_tcp_queue_id(queue), rq->tag);
630 nvme_tcp_error_recovery(ctrl: &queue->ctrl->ctrl);
631 return -EPROTO;
632 }
633
634 return 0;
635}
636
637static int nvme_tcp_handle_comp(struct nvme_tcp_queue *queue,
638 struct nvme_tcp_rsp_pdu *pdu)
639{
640 struct nvme_completion *cqe = &pdu->cqe;
641 int ret = 0;
642
643 /*
644 * AEN requests are special as they don't time out and can
645 * survive any kind of queue freeze and often don't respond to
646 * aborts. We don't even bother to allocate a struct request
647 * for them but rather special case them here.
648 */
649 if (unlikely(nvme_is_aen_req(nvme_tcp_queue_id(queue),
650 cqe->command_id)))
651 nvme_complete_async_event(ctrl: &queue->ctrl->ctrl, status: cqe->status,
652 res: &cqe->result);
653 else
654 ret = nvme_tcp_process_nvme_cqe(queue, cqe);
655
656 return ret;
657}
658
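/*
 * Build the next H2CData PDU in response to an R2T. Each PDU is capped by
 * the controller-advertised MAXH2CDATA (queue->maxh2cdata from the
 * ICResp), so a large R2T is satisfied by a chain of H2CData PDUs with
 * NVME_TCP_F_DATA_LAST set only on the final one.
 */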
659static void nvme_tcp_setup_h2c_data_pdu(struct nvme_tcp_request *req)
660{
661 struct nvme_tcp_data_pdu *data = nvme_tcp_req_data_pdu(req);
662 struct nvme_tcp_queue *queue = req->queue;
663 struct request *rq = blk_mq_rq_from_pdu(pdu: req);
664 u32 h2cdata_sent = req->pdu_len;
665 u8 hdgst = nvme_tcp_hdgst_len(queue);
666 u8 ddgst = nvme_tcp_ddgst_len(queue);
667
668 req->state = NVME_TCP_SEND_H2C_PDU;
669 req->offset = 0;
670 req->pdu_len = min(req->h2cdata_left, queue->maxh2cdata);
671 req->pdu_sent = 0;
672 req->h2cdata_left -= req->pdu_len;
673 req->h2cdata_offset += h2cdata_sent;
674
675 memset(data, 0, sizeof(*data));
676 data->hdr.type = nvme_tcp_h2c_data;
677 if (!req->h2cdata_left)
678 data->hdr.flags = NVME_TCP_F_DATA_LAST;
679 if (queue->hdr_digest)
680 data->hdr.flags |= NVME_TCP_F_HDGST;
681 if (queue->data_digest)
682 data->hdr.flags |= NVME_TCP_F_DDGST;
683 data->hdr.hlen = sizeof(*data);
684 data->hdr.pdo = data->hdr.hlen + hdgst;
685 data->hdr.plen =
686 cpu_to_le32(data->hdr.hlen + hdgst + req->pdu_len + ddgst);
687 data->ttag = req->ttag;
688 data->command_id = nvme_cid(rq);
689 data->data_offset = cpu_to_le32(req->h2cdata_offset);
690 data->data_length = cpu_to_le32(req->pdu_len);
691}
692
693static int nvme_tcp_handle_r2t(struct nvme_tcp_queue *queue,
694 struct nvme_tcp_r2t_pdu *pdu)
695{
696 struct nvme_tcp_request *req;
697 struct request *rq;
698 u32 r2t_length = le32_to_cpu(pdu->r2t_length);
699 u32 r2t_offset = le32_to_cpu(pdu->r2t_offset);
700
701 rq = nvme_find_rq(tags: nvme_tcp_tagset(queue), command_id: pdu->command_id);
702 if (!rq) {
703 dev_err(queue->ctrl->ctrl.device,
704 "got bad r2t.command_id %#x on queue %d\n",
705 pdu->command_id, nvme_tcp_queue_id(queue));
706 return -ENOENT;
707 }
708 req = blk_mq_rq_to_pdu(rq);
709
710 if (unlikely(!r2t_length)) {
711 dev_err(queue->ctrl->ctrl.device,
712 "req %d r2t len is %u, probably a bug...\n",
713 rq->tag, r2t_length);
714 return -EPROTO;
715 }
716
717 if (unlikely(req->data_sent + r2t_length > req->data_len)) {
718 dev_err(queue->ctrl->ctrl.device,
719 "req %d r2t len %u exceeded data len %u (%zu sent)\n",
720 rq->tag, r2t_length, req->data_len, req->data_sent);
721 return -EPROTO;
722 }
723
724 if (unlikely(r2t_offset < req->data_sent)) {
725 dev_err(queue->ctrl->ctrl.device,
726 "req %d unexpected r2t offset %u (expected %zu)\n",
727 rq->tag, r2t_offset, req->data_sent);
728 return -EPROTO;
729 }
730
731 req->pdu_len = 0;
732 req->h2cdata_left = r2t_length;
733 req->h2cdata_offset = r2t_offset;
734 req->ttag = pdu->ttag;
735
736 nvme_tcp_setup_h2c_data_pdu(req);
737 nvme_tcp_queue_request(req, sync: false, last: true);
738
739 return 0;
740}
741
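/*
 * Reassemble a PDU header that may arrive split across several skbs, then
 * verify the header digest (and prime the data digest hash) before
 * dispatching on the PDU type: C2HData, CapsuleResp or R2T.
 */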
742static int nvme_tcp_recv_pdu(struct nvme_tcp_queue *queue, struct sk_buff *skb,
743 unsigned int *offset, size_t *len)
744{
745 struct nvme_tcp_hdr *hdr;
746 char *pdu = queue->pdu;
747 size_t rcv_len = min_t(size_t, *len, queue->pdu_remaining);
748 int ret;
749
750 ret = skb_copy_bits(skb, offset: *offset,
751 to: &pdu[queue->pdu_offset], len: rcv_len);
752 if (unlikely(ret))
753 return ret;
754
755 queue->pdu_remaining -= rcv_len;
756 queue->pdu_offset += rcv_len;
757 *offset += rcv_len;
758 *len -= rcv_len;
759 if (queue->pdu_remaining)
760 return 0;
761
762 hdr = queue->pdu;
763 if (queue->hdr_digest) {
764 ret = nvme_tcp_verify_hdgst(queue, pdu: queue->pdu, pdu_len: hdr->hlen);
765 if (unlikely(ret))
766 return ret;
767 }
768
769
770 if (queue->data_digest) {
771 ret = nvme_tcp_check_ddgst(queue, pdu: queue->pdu);
772 if (unlikely(ret))
773 return ret;
774 }
775
776 switch (hdr->type) {
777 case nvme_tcp_c2h_data:
778 return nvme_tcp_handle_c2h_data(queue, pdu: (void *)queue->pdu);
779 case nvme_tcp_rsp:
780 nvme_tcp_init_recv_ctx(queue);
781 return nvme_tcp_handle_comp(queue, pdu: (void *)queue->pdu);
782 case nvme_tcp_r2t:
783 nvme_tcp_init_recv_ctx(queue);
784 return nvme_tcp_handle_r2t(queue, pdu: (void *)queue->pdu);
785 default:
786 dev_err(queue->ctrl->ctrl.device,
787 "unsupported pdu type (%d)\n", hdr->type);
788 return -EINVAL;
789 }
790}
791
792static inline void nvme_tcp_end_request(struct request *rq, u16 status)
793{
794 union nvme_result res = {};
795
796 if (!nvme_try_complete_req(req: rq, cpu_to_le16(status << 1), result: res))
797 nvme_complete_rq(req: rq);
798}
799
800static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb,
801 unsigned int *offset, size_t *len)
802{
803 struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
804 struct request *rq =
805 nvme_cid_to_rq(tags: nvme_tcp_tagset(queue), command_id: pdu->command_id);
806 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
807
808 while (true) {
809 int recv_len, ret;
810
811 recv_len = min_t(size_t, *len, queue->data_remaining);
812 if (!recv_len)
813 break;
814
815 if (!iov_iter_count(i: &req->iter)) {
816 req->curr_bio = req->curr_bio->bi_next;
817
			/*
			 * If we don't have any more bios it means that the
			 * controller sent more data than we requested, hence
			 * error.
			 */
822 if (!req->curr_bio) {
823 dev_err(queue->ctrl->ctrl.device,
824 "queue %d no space in request %#x",
825 nvme_tcp_queue_id(queue), rq->tag);
826 nvme_tcp_init_recv_ctx(queue);
827 return -EIO;
828 }
829 nvme_tcp_init_iter(req, ITER_DEST);
830 }
831
832 /* we can read only from what is left in this bio */
833 recv_len = min_t(size_t, recv_len,
834 iov_iter_count(&req->iter));
835
836 if (queue->data_digest)
837 ret = skb_copy_and_hash_datagram_iter(skb, offset: *offset,
838 to: &req->iter, len: recv_len, hash: queue->rcv_hash);
839 else
840 ret = skb_copy_datagram_iter(from: skb, offset: *offset,
841 to: &req->iter, size: recv_len);
842 if (ret) {
843 dev_err(queue->ctrl->ctrl.device,
844 "queue %d failed to copy request %#x data",
845 nvme_tcp_queue_id(queue), rq->tag);
846 return ret;
847 }
848
849 *len -= recv_len;
850 *offset += recv_len;
851 queue->data_remaining -= recv_len;
852 }
853
854 if (!queue->data_remaining) {
855 if (queue->data_digest) {
856 nvme_tcp_ddgst_final(hash: queue->rcv_hash, dgst: &queue->exp_ddgst);
857 queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH;
858 } else {
859 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
860 nvme_tcp_end_request(rq,
861 le16_to_cpu(req->status));
862 queue->nr_cqe++;
863 }
864 nvme_tcp_init_recv_ctx(queue);
865 }
866 }
867
868 return 0;
869}
870
871static int nvme_tcp_recv_ddgst(struct nvme_tcp_queue *queue,
872 struct sk_buff *skb, unsigned int *offset, size_t *len)
873{
874 struct nvme_tcp_data_pdu *pdu = (void *)queue->pdu;
875 char *ddgst = (char *)&queue->recv_ddgst;
876 size_t recv_len = min_t(size_t, *len, queue->ddgst_remaining);
877 off_t off = NVME_TCP_DIGEST_LENGTH - queue->ddgst_remaining;
878 int ret;
879
880 ret = skb_copy_bits(skb, offset: *offset, to: &ddgst[off], len: recv_len);
881 if (unlikely(ret))
882 return ret;
883
884 queue->ddgst_remaining -= recv_len;
885 *offset += recv_len;
886 *len -= recv_len;
887 if (queue->ddgst_remaining)
888 return 0;
889
890 if (queue->recv_ddgst != queue->exp_ddgst) {
891 struct request *rq = nvme_cid_to_rq(tags: nvme_tcp_tagset(queue),
892 command_id: pdu->command_id);
893 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
894
895 req->status = cpu_to_le16(NVME_SC_DATA_XFER_ERROR);
896
897 dev_err(queue->ctrl->ctrl.device,
898 "data digest error: recv %#x expected %#x\n",
899 le32_to_cpu(queue->recv_ddgst),
900 le32_to_cpu(queue->exp_ddgst));
901 }
902
903 if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) {
904 struct request *rq = nvme_cid_to_rq(tags: nvme_tcp_tagset(queue),
905 command_id: pdu->command_id);
906 struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
907
908 nvme_tcp_end_request(rq, le16_to_cpu(req->status));
909 queue->nr_cqe++;
910 }
911
912 nvme_tcp_init_recv_ctx(queue);
913 return 0;
914}
915
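/*
 * ->read_sock() callback, invoked from nvme_tcp_try_recv() with the socket
 * locked. Consumes the skb according to the current receive state and
 * returns the number of bytes consumed; on error the queue stops reading
 * and error recovery is scheduled.
 */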
916static int nvme_tcp_recv_skb(read_descriptor_t *desc, struct sk_buff *skb,
917 unsigned int offset, size_t len)
918{
919 struct nvme_tcp_queue *queue = desc->arg.data;
920 size_t consumed = len;
921 int result;
922
923 if (unlikely(!queue->rd_enabled))
924 return -EFAULT;
925
926 while (len) {
927 switch (nvme_tcp_recv_state(queue)) {
928 case NVME_TCP_RECV_PDU:
929 result = nvme_tcp_recv_pdu(queue, skb, offset: &offset, len: &len);
930 break;
931 case NVME_TCP_RECV_DATA:
932 result = nvme_tcp_recv_data(queue, skb, offset: &offset, len: &len);
933 break;
934 case NVME_TCP_RECV_DDGST:
935 result = nvme_tcp_recv_ddgst(queue, skb, offset: &offset, len: &len);
936 break;
937 default:
938 result = -EFAULT;
939 }
940 if (result) {
941 dev_err(queue->ctrl->ctrl.device,
942 "receive failed: %d\n", result);
943 queue->rd_enabled = false;
944 nvme_tcp_error_recovery(ctrl: &queue->ctrl->ctrl);
945 return result;
946 }
947 }
948
949 return consumed;
950}
951
952static void nvme_tcp_data_ready(struct sock *sk)
953{
954 struct nvme_tcp_queue *queue;
955
956 trace_sk_data_ready(sk);
957
958 read_lock_bh(&sk->sk_callback_lock);
959 queue = sk->sk_user_data;
960 if (likely(queue && queue->rd_enabled) &&
961 !test_bit(NVME_TCP_Q_POLLING, &queue->flags))
962 queue_work_on(cpu: queue->io_cpu, wq: nvme_tcp_wq, work: &queue->io_work);
963 read_unlock_bh(&sk->sk_callback_lock);
964}
965
966static void nvme_tcp_write_space(struct sock *sk)
967{
968 struct nvme_tcp_queue *queue;
969
970 read_lock_bh(&sk->sk_callback_lock);
971 queue = sk->sk_user_data;
972 if (likely(queue && sk_stream_is_writeable(sk))) {
973 clear_bit(SOCK_NOSPACE, addr: &sk->sk_socket->flags);
974 queue_work_on(cpu: queue->io_cpu, wq: nvme_tcp_wq, work: &queue->io_work);
975 }
976 read_unlock_bh(&sk->sk_callback_lock);
977}
978
979static void nvme_tcp_state_change(struct sock *sk)
980{
981 struct nvme_tcp_queue *queue;
982
983 read_lock_bh(&sk->sk_callback_lock);
984 queue = sk->sk_user_data;
985 if (!queue)
986 goto done;
987
988 switch (sk->sk_state) {
989 case TCP_CLOSE:
990 case TCP_CLOSE_WAIT:
991 case TCP_LAST_ACK:
992 case TCP_FIN_WAIT1:
993 case TCP_FIN_WAIT2:
994 nvme_tcp_error_recovery(ctrl: &queue->ctrl->ctrl);
995 break;
996 default:
997 dev_info(queue->ctrl->ctrl.device,
998 "queue %d socket state %d\n",
999 nvme_tcp_queue_id(queue), sk->sk_state);
1000 }
1001
1002 queue->state_change(sk);
1003done:
1004 read_unlock_bh(&sk->sk_callback_lock);
1005}
1006
1007static inline void nvme_tcp_done_send_req(struct nvme_tcp_queue *queue)
1008{
1009 queue->request = NULL;
1010}
1011
1012static void nvme_tcp_fail_request(struct nvme_tcp_request *req)
1013{
1014 if (nvme_tcp_async_req(req)) {
1015 union nvme_result res = {};
1016
1017 nvme_complete_async_event(ctrl: &req->queue->ctrl->ctrl,
1018 cpu_to_le16(NVME_SC_HOST_PATH_ERROR), res: &res);
1019 } else {
1020 nvme_tcp_end_request(rq: blk_mq_rq_from_pdu(pdu: req),
1021 status: NVME_SC_HOST_PATH_ERROR);
1022 }
1023}
1024
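/*
 * Send the data payload of the current PDU. Pages that are safe to hand to
 * the network stack (sendpage_ok()) are spliced zero-copy via
 * MSG_SPLICE_PAGES, otherwise the flag is dropped and the data is copied;
 * MSG_MORE/MSG_EOR tell the stack whether more bytes of this exchange
 * follow.
 */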
1025static int nvme_tcp_try_send_data(struct nvme_tcp_request *req)
1026{
1027 struct nvme_tcp_queue *queue = req->queue;
1028 int req_data_len = req->data_len;
1029 u32 h2cdata_left = req->h2cdata_left;
1030
1031 while (true) {
1032 struct bio_vec bvec;
1033 struct msghdr msg = {
1034 .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES,
1035 };
1036 struct page *page = nvme_tcp_req_cur_page(req);
1037 size_t offset = nvme_tcp_req_cur_offset(req);
1038 size_t len = nvme_tcp_req_cur_length(req);
1039 bool last = nvme_tcp_pdu_last_send(req, len);
1040 int req_data_sent = req->data_sent;
1041 int ret;
1042
1043 if (last && !queue->data_digest && !nvme_tcp_queue_more(queue))
1044 msg.msg_flags |= MSG_EOR;
1045 else
1046 msg.msg_flags |= MSG_MORE;
1047
1048 if (!sendpage_ok(page))
1049 msg.msg_flags &= ~MSG_SPLICE_PAGES;
1050
1051 bvec_set_page(bv: &bvec, page, len, offset);
1052 iov_iter_bvec(i: &msg.msg_iter, ITER_SOURCE, bvec: &bvec, nr_segs: 1, count: len);
1053 ret = sock_sendmsg(sock: queue->sock, msg: &msg);
1054 if (ret <= 0)
1055 return ret;
1056
1057 if (queue->data_digest)
1058 nvme_tcp_ddgst_update(hash: queue->snd_hash, page,
1059 off: offset, len: ret);
1060
		/*
		 * Update the request iterator except for the last payload send
		 * in the request, where we don't want to modify it as we may
		 * compete with the RX path completing the request.
		 */
1066 if (req_data_sent + ret < req_data_len)
1067 nvme_tcp_advance_req(req, len: ret);
1068
1069 /* fully successful last send in current PDU */
1070 if (last && ret == len) {
1071 if (queue->data_digest) {
1072 nvme_tcp_ddgst_final(hash: queue->snd_hash,
1073 dgst: &req->ddgst);
1074 req->state = NVME_TCP_SEND_DDGST;
1075 req->offset = 0;
1076 } else {
1077 if (h2cdata_left)
1078 nvme_tcp_setup_h2c_data_pdu(req);
1079 else
1080 nvme_tcp_done_send_req(queue);
1081 }
1082 return 1;
1083 }
1084 }
1085 return -EAGAIN;
1086}
1087
1088static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req)
1089{
1090 struct nvme_tcp_queue *queue = req->queue;
1091 struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
1092 struct bio_vec bvec;
1093 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_SPLICE_PAGES, };
1094 bool inline_data = nvme_tcp_has_inline_data(req);
1095 u8 hdgst = nvme_tcp_hdgst_len(queue);
1096 int len = sizeof(*pdu) + hdgst - req->offset;
1097 int ret;
1098
1099 if (inline_data || nvme_tcp_queue_more(queue))
1100 msg.msg_flags |= MSG_MORE;
1101 else
1102 msg.msg_flags |= MSG_EOR;
1103
1104 if (queue->hdr_digest && !req->offset)
1105 nvme_tcp_hdgst(hash: queue->snd_hash, pdu, len: sizeof(*pdu));
1106
1107 bvec_set_virt(bv: &bvec, vaddr: (void *)pdu + req->offset, len);
1108 iov_iter_bvec(i: &msg.msg_iter, ITER_SOURCE, bvec: &bvec, nr_segs: 1, count: len);
1109 ret = sock_sendmsg(sock: queue->sock, msg: &msg);
1110 if (unlikely(ret <= 0))
1111 return ret;
1112
1113 len -= ret;
1114 if (!len) {
1115 if (inline_data) {
1116 req->state = NVME_TCP_SEND_DATA;
1117 if (queue->data_digest)
1118 crypto_ahash_init(req: queue->snd_hash);
1119 } else {
1120 nvme_tcp_done_send_req(queue);
1121 }
1122 return 1;
1123 }
1124 req->offset += ret;
1125
1126 return -EAGAIN;
1127}
1128
1129static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req)
1130{
1131 struct nvme_tcp_queue *queue = req->queue;
1132 struct nvme_tcp_data_pdu *pdu = nvme_tcp_req_data_pdu(req);
1133 struct bio_vec bvec;
1134 struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_MORE, };
1135 u8 hdgst = nvme_tcp_hdgst_len(queue);
1136 int len = sizeof(*pdu) - req->offset + hdgst;
1137 int ret;
1138
1139 if (queue->hdr_digest && !req->offset)
1140 nvme_tcp_hdgst(hash: queue->snd_hash, pdu, len: sizeof(*pdu));
1141
1142 if (!req->h2cdata_left)
1143 msg.msg_flags |= MSG_SPLICE_PAGES;
1144
1145 bvec_set_virt(bv: &bvec, vaddr: (void *)pdu + req->offset, len);
1146 iov_iter_bvec(i: &msg.msg_iter, ITER_SOURCE, bvec: &bvec, nr_segs: 1, count: len);
1147 ret = sock_sendmsg(sock: queue->sock, msg: &msg);
1148 if (unlikely(ret <= 0))
1149 return ret;
1150
1151 len -= ret;
1152 if (!len) {
1153 req->state = NVME_TCP_SEND_DATA;
1154 if (queue->data_digest)
1155 crypto_ahash_init(req: queue->snd_hash);
1156 return 1;
1157 }
1158 req->offset += ret;
1159
1160 return -EAGAIN;
1161}
1162
1163static int nvme_tcp_try_send_ddgst(struct nvme_tcp_request *req)
1164{
1165 struct nvme_tcp_queue *queue = req->queue;
1166 size_t offset = req->offset;
1167 u32 h2cdata_left = req->h2cdata_left;
1168 int ret;
1169 struct msghdr msg = { .msg_flags = MSG_DONTWAIT };
1170 struct kvec iov = {
1171 .iov_base = (u8 *)&req->ddgst + req->offset,
1172 .iov_len = NVME_TCP_DIGEST_LENGTH - req->offset
1173 };
1174
1175 if (nvme_tcp_queue_more(queue))
1176 msg.msg_flags |= MSG_MORE;
1177 else
1178 msg.msg_flags |= MSG_EOR;
1179
1180 ret = kernel_sendmsg(sock: queue->sock, msg: &msg, vec: &iov, num: 1, len: iov.iov_len);
1181 if (unlikely(ret <= 0))
1182 return ret;
1183
1184 if (offset + ret == NVME_TCP_DIGEST_LENGTH) {
1185 if (h2cdata_left)
1186 nvme_tcp_setup_h2c_data_pdu(req);
1187 else
1188 nvme_tcp_done_send_req(queue);
1189 return 1;
1190 }
1191
1192 req->offset += ret;
1193 return -EAGAIN;
1194}
1195
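/*
 * Advance the current request through the send state machine. Returns 1
 * when progress was made, 0 when there is nothing (more) to send right
 * now, or a negative error after the request has been failed.
 */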
1196static int nvme_tcp_try_send(struct nvme_tcp_queue *queue)
1197{
1198 struct nvme_tcp_request *req;
1199 unsigned int noreclaim_flag;
1200 int ret = 1;
1201
1202 if (!queue->request) {
1203 queue->request = nvme_tcp_fetch_request(queue);
1204 if (!queue->request)
1205 return 0;
1206 }
1207 req = queue->request;
1208
1209 noreclaim_flag = memalloc_noreclaim_save();
1210 if (req->state == NVME_TCP_SEND_CMD_PDU) {
1211 ret = nvme_tcp_try_send_cmd_pdu(req);
1212 if (ret <= 0)
1213 goto done;
1214 if (!nvme_tcp_has_inline_data(req))
1215 goto out;
1216 }
1217
1218 if (req->state == NVME_TCP_SEND_H2C_PDU) {
1219 ret = nvme_tcp_try_send_data_pdu(req);
1220 if (ret <= 0)
1221 goto done;
1222 }
1223
1224 if (req->state == NVME_TCP_SEND_DATA) {
1225 ret = nvme_tcp_try_send_data(req);
1226 if (ret <= 0)
1227 goto done;
1228 }
1229
1230 if (req->state == NVME_TCP_SEND_DDGST)
1231 ret = nvme_tcp_try_send_ddgst(req);
1232done:
1233 if (ret == -EAGAIN) {
1234 ret = 0;
1235 } else if (ret < 0) {
1236 dev_err(queue->ctrl->ctrl.device,
1237 "failed to send request %d\n", ret);
1238 nvme_tcp_fail_request(req: queue->request);
1239 nvme_tcp_done_send_req(queue);
1240 }
1241out:
1242 memalloc_noreclaim_restore(flags: noreclaim_flag);
1243 return ret;
1244}
1245
1246static int nvme_tcp_try_recv(struct nvme_tcp_queue *queue)
1247{
1248 struct socket *sock = queue->sock;
1249 struct sock *sk = sock->sk;
1250 read_descriptor_t rd_desc;
1251 int consumed;
1252
1253 rd_desc.arg.data = queue;
1254 rd_desc.count = 1;
1255 lock_sock(sk);
1256 queue->nr_cqe = 0;
1257 consumed = sock->ops->read_sock(sk, &rd_desc, nvme_tcp_recv_skb);
1258 release_sock(sk);
1259 return consumed;
1260}
1261
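/*
 * Per-queue I/O worker: alternates between trying to send and trying to
 * receive for a roughly 1ms budget, returns early when no progress is
 * being made, and otherwise requeues itself so a single queue cannot
 * monopolize the CPU.
 */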
1262static void nvme_tcp_io_work(struct work_struct *w)
1263{
1264 struct nvme_tcp_queue *queue =
1265 container_of(w, struct nvme_tcp_queue, io_work);
1266 unsigned long deadline = jiffies + msecs_to_jiffies(m: 1);
1267
1268 do {
1269 bool pending = false;
1270 int result;
1271
1272 if (mutex_trylock(lock: &queue->send_mutex)) {
1273 result = nvme_tcp_try_send(queue);
1274 mutex_unlock(lock: &queue->send_mutex);
1275 if (result > 0)
1276 pending = true;
1277 else if (unlikely(result < 0))
1278 break;
1279 }
1280
1281 result = nvme_tcp_try_recv(queue);
1282 if (result > 0)
1283 pending = true;
1284 else if (unlikely(result < 0))
1285 return;
1286
1287 if (!pending || !queue->rd_enabled)
1288 return;
1289
1290 } while (!time_after(jiffies, deadline)); /* quota is exhausted */
1291
1292 queue_work_on(cpu: queue->io_cpu, wq: nvme_tcp_wq, work: &queue->io_work);
1293}
1294
1295static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue)
1296{
1297 struct crypto_ahash *tfm = crypto_ahash_reqtfm(req: queue->rcv_hash);
1298
1299 ahash_request_free(req: queue->rcv_hash);
1300 ahash_request_free(req: queue->snd_hash);
1301 crypto_free_ahash(tfm);
1302}
1303
1304static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue)
1305{
1306 struct crypto_ahash *tfm;
1307
1308 tfm = crypto_alloc_ahash(alg_name: "crc32c", type: 0, CRYPTO_ALG_ASYNC);
1309 if (IS_ERR(ptr: tfm))
1310 return PTR_ERR(ptr: tfm);
1311
1312 queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1313 if (!queue->snd_hash)
1314 goto free_tfm;
1315 ahash_request_set_callback(req: queue->snd_hash, flags: 0, NULL, NULL);
1316
1317 queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL);
1318 if (!queue->rcv_hash)
1319 goto free_snd_hash;
1320 ahash_request_set_callback(req: queue->rcv_hash, flags: 0, NULL, NULL);
1321
1322 return 0;
1323free_snd_hash:
1324 ahash_request_free(req: queue->snd_hash);
1325free_tfm:
1326 crypto_free_ahash(tfm);
1327 return -ENOMEM;
1328}
1329
1330static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl)
1331{
1332 struct nvme_tcp_request *async = &ctrl->async_req;
1333
1334 page_frag_free(addr: async->pdu);
1335}
1336
1337static int nvme_tcp_alloc_async_req(struct nvme_tcp_ctrl *ctrl)
1338{
1339 struct nvme_tcp_queue *queue = &ctrl->queues[0];
1340 struct nvme_tcp_request *async = &ctrl->async_req;
1341 u8 hdgst = nvme_tcp_hdgst_len(queue);
1342
1343 async->pdu = page_frag_alloc(nc: &queue->pf_cache,
1344 fragsz: sizeof(struct nvme_tcp_cmd_pdu) + hdgst,
1345 GFP_KERNEL | __GFP_ZERO);
1346 if (!async->pdu)
1347 return -ENOMEM;
1348
1349 async->queue = &ctrl->queues[0];
1350 return 0;
1351}
1352
1353static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid)
1354{
1355 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(ctrl: nctrl);
1356 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1357 unsigned int noreclaim_flag;
1358
1359 if (!test_and_clear_bit(nr: NVME_TCP_Q_ALLOCATED, addr: &queue->flags))
1360 return;
1361
1362 if (queue->hdr_digest || queue->data_digest)
1363 nvme_tcp_free_crypto(queue);
1364
1365 page_frag_cache_drain(nc: &queue->pf_cache);
1366
1367 noreclaim_flag = memalloc_noreclaim_save();
1368 /* ->sock will be released by fput() */
1369 fput(queue->sock->file);
1370 queue->sock = NULL;
1371 memalloc_noreclaim_restore(flags: noreclaim_flag);
1372
1373 kfree(objp: queue->pdu);
1374 mutex_destroy(lock: &queue->send_mutex);
1375 mutex_destroy(lock: &queue->queue_lock);
1376}
1377
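/*
 * NVMe/TCP connection establishment: send an ICReq advertising PFV 1.0,
 * the negotiated digest settings, HPDA 0 and MAXR2T 0 (a single
 * outstanding R2T), then validate the ICResp (type, length, PFV, digest
 * selections, CPDA and MAXH2CDATA) before the queue carries capsules.
 */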
1378static int nvme_tcp_init_connection(struct nvme_tcp_queue *queue)
1379{
1380 struct nvme_tcp_icreq_pdu *icreq;
1381 struct nvme_tcp_icresp_pdu *icresp;
1382 char cbuf[CMSG_LEN(sizeof(char))] = {};
1383 u8 ctype;
1384 struct msghdr msg = {};
1385 struct kvec iov;
1386 bool ctrl_hdgst, ctrl_ddgst;
1387 u32 maxh2cdata;
1388 int ret;
1389
1390 icreq = kzalloc(size: sizeof(*icreq), GFP_KERNEL);
1391 if (!icreq)
1392 return -ENOMEM;
1393
1394 icresp = kzalloc(size: sizeof(*icresp), GFP_KERNEL);
1395 if (!icresp) {
1396 ret = -ENOMEM;
1397 goto free_icreq;
1398 }
1399
1400 icreq->hdr.type = nvme_tcp_icreq;
1401 icreq->hdr.hlen = sizeof(*icreq);
1402 icreq->hdr.pdo = 0;
1403 icreq->hdr.plen = cpu_to_le32(icreq->hdr.hlen);
1404 icreq->pfv = cpu_to_le16(NVME_TCP_PFV_1_0);
1405 icreq->maxr2t = 0; /* single inflight r2t supported */
1406 icreq->hpda = 0; /* no alignment constraint */
1407 if (queue->hdr_digest)
1408 icreq->digest |= NVME_TCP_HDR_DIGEST_ENABLE;
1409 if (queue->data_digest)
1410 icreq->digest |= NVME_TCP_DATA_DIGEST_ENABLE;
1411
1412 iov.iov_base = icreq;
1413 iov.iov_len = sizeof(*icreq);
1414 ret = kernel_sendmsg(sock: queue->sock, msg: &msg, vec: &iov, num: 1, len: iov.iov_len);
1415 if (ret < 0) {
1416 pr_warn("queue %d: failed to send icreq, error %d\n",
1417 nvme_tcp_queue_id(queue), ret);
1418 goto free_icresp;
1419 }
1420
1421 memset(&msg, 0, sizeof(msg));
1422 iov.iov_base = icresp;
1423 iov.iov_len = sizeof(*icresp);
1424 if (nvme_tcp_tls(ctrl: &queue->ctrl->ctrl)) {
1425 msg.msg_control = cbuf;
1426 msg.msg_controllen = sizeof(cbuf);
1427 }
1428 ret = kernel_recvmsg(sock: queue->sock, msg: &msg, vec: &iov, num: 1,
1429 len: iov.iov_len, flags: msg.msg_flags);
1430 if (ret < 0) {
1431 pr_warn("queue %d: failed to receive icresp, error %d\n",
1432 nvme_tcp_queue_id(queue), ret);
1433 goto free_icresp;
1434 }
1435 ret = -ENOTCONN;
1436 if (nvme_tcp_tls(ctrl: &queue->ctrl->ctrl)) {
1437 ctype = tls_get_record_type(sk: queue->sock->sk,
1438 msg: (struct cmsghdr *)cbuf);
1439 if (ctype != TLS_RECORD_TYPE_DATA) {
1440 pr_err("queue %d: unhandled TLS record %d\n",
1441 nvme_tcp_queue_id(queue), ctype);
1442 goto free_icresp;
1443 }
1444 }
1445 ret = -EINVAL;
1446 if (icresp->hdr.type != nvme_tcp_icresp) {
1447 pr_err("queue %d: bad type returned %d\n",
1448 nvme_tcp_queue_id(queue), icresp->hdr.type);
1449 goto free_icresp;
1450 }
1451
1452 if (le32_to_cpu(icresp->hdr.plen) != sizeof(*icresp)) {
1453 pr_err("queue %d: bad pdu length returned %d\n",
1454 nvme_tcp_queue_id(queue), icresp->hdr.plen);
1455 goto free_icresp;
1456 }
1457
1458 if (icresp->pfv != NVME_TCP_PFV_1_0) {
1459 pr_err("queue %d: bad pfv returned %d\n",
1460 nvme_tcp_queue_id(queue), icresp->pfv);
1461 goto free_icresp;
1462 }
1463
1464 ctrl_ddgst = !!(icresp->digest & NVME_TCP_DATA_DIGEST_ENABLE);
1465 if ((queue->data_digest && !ctrl_ddgst) ||
1466 (!queue->data_digest && ctrl_ddgst)) {
1467 pr_err("queue %d: data digest mismatch host: %s ctrl: %s\n",
1468 nvme_tcp_queue_id(queue),
1469 queue->data_digest ? "enabled" : "disabled",
1470 ctrl_ddgst ? "enabled" : "disabled");
1471 goto free_icresp;
1472 }
1473
1474 ctrl_hdgst = !!(icresp->digest & NVME_TCP_HDR_DIGEST_ENABLE);
1475 if ((queue->hdr_digest && !ctrl_hdgst) ||
1476 (!queue->hdr_digest && ctrl_hdgst)) {
1477 pr_err("queue %d: header digest mismatch host: %s ctrl: %s\n",
1478 nvme_tcp_queue_id(queue),
1479 queue->hdr_digest ? "enabled" : "disabled",
1480 ctrl_hdgst ? "enabled" : "disabled");
1481 goto free_icresp;
1482 }
1483
1484 if (icresp->cpda != 0) {
1485 pr_err("queue %d: unsupported cpda returned %d\n",
1486 nvme_tcp_queue_id(queue), icresp->cpda);
1487 goto free_icresp;
1488 }
1489
1490 maxh2cdata = le32_to_cpu(icresp->maxdata);
1491 if ((maxh2cdata % 4) || (maxh2cdata < NVME_TCP_MIN_MAXH2CDATA)) {
1492 pr_err("queue %d: invalid maxh2cdata returned %u\n",
1493 nvme_tcp_queue_id(queue), maxh2cdata);
1494 goto free_icresp;
1495 }
1496 queue->maxh2cdata = maxh2cdata;
1497
1498 ret = 0;
1499free_icresp:
1500 kfree(objp: icresp);
1501free_icreq:
1502 kfree(objp: icreq);
1503 return ret;
1504}
1505
1506static bool nvme_tcp_admin_queue(struct nvme_tcp_queue *queue)
1507{
1508 return nvme_tcp_queue_id(queue) == 0;
1509}
1510
1511static bool nvme_tcp_default_queue(struct nvme_tcp_queue *queue)
1512{
1513 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1514 int qid = nvme_tcp_queue_id(queue);
1515
1516 return !nvme_tcp_admin_queue(queue) &&
1517 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT];
1518}
1519
1520static bool nvme_tcp_read_queue(struct nvme_tcp_queue *queue)
1521{
1522 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1523 int qid = nvme_tcp_queue_id(queue);
1524
1525 return !nvme_tcp_admin_queue(queue) &&
1526 !nvme_tcp_default_queue(queue) &&
1527 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1528 ctrl->io_queues[HCTX_TYPE_READ];
1529}
1530
1531static bool nvme_tcp_poll_queue(struct nvme_tcp_queue *queue)
1532{
1533 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1534 int qid = nvme_tcp_queue_id(queue);
1535
1536 return !nvme_tcp_admin_queue(queue) &&
1537 !nvme_tcp_default_queue(queue) &&
1538 !nvme_tcp_read_queue(queue) &&
1539 qid < 1 + ctrl->io_queues[HCTX_TYPE_DEFAULT] +
1540 ctrl->io_queues[HCTX_TYPE_READ] +
1541 ctrl->io_queues[HCTX_TYPE_POLL];
1542}
1543
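/*
 * Map a queue to the CPU that runs its io_work: default, read and poll
 * queues are each spread over the online CPUs within their own class,
 * unless wq_unbound is set, in which case the workqueue core chooses the
 * CPU (WORK_CPU_UNBOUND).
 */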
1544static void nvme_tcp_set_queue_io_cpu(struct nvme_tcp_queue *queue)
1545{
1546 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1547 int qid = nvme_tcp_queue_id(queue);
1548 int n = 0;
1549
1550 if (nvme_tcp_default_queue(queue))
1551 n = qid - 1;
1552 else if (nvme_tcp_read_queue(queue))
1553 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] - 1;
1554 else if (nvme_tcp_poll_queue(queue))
1555 n = qid - ctrl->io_queues[HCTX_TYPE_DEFAULT] -
1556 ctrl->io_queues[HCTX_TYPE_READ] - 1;
1557 if (wq_unbound)
1558 queue->io_cpu = WORK_CPU_UNBOUND;
1559 else
1560 queue->io_cpu = cpumask_next_wrap(n: n - 1, cpu_online_mask, start: -1, wrap: false);
1561}
1562
1563static void nvme_tcp_tls_done(void *data, int status, key_serial_t pskid)
1564{
1565 struct nvme_tcp_queue *queue = data;
1566 struct nvme_tcp_ctrl *ctrl = queue->ctrl;
1567 int qid = nvme_tcp_queue_id(queue);
1568 struct key *tls_key;
1569
1570 dev_dbg(ctrl->ctrl.device, "queue %d: TLS handshake done, key %x, status %d\n",
1571 qid, pskid, status);
1572
1573 if (status) {
1574 queue->tls_err = -status;
1575 goto out_complete;
1576 }
1577
1578 tls_key = key_lookup(id: pskid);
1579 if (IS_ERR(ptr: tls_key)) {
1580 dev_warn(ctrl->ctrl.device, "queue %d: Invalid key %x\n",
1581 qid, pskid);
1582 queue->tls_err = -ENOKEY;
1583 } else {
1584 ctrl->ctrl.tls_key = tls_key;
1585 queue->tls_err = 0;
1586 }
1587
1588out_complete:
1589 complete(&queue->tls_complete);
1590}
1591
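/*
 * Start a TLS PSK handshake by handing the connected socket to the
 * userspace handshake agent (typically tlshd) via tls_client_hello_psk(),
 * then wait up to tls_handshake_timeout seconds for nvme_tcp_tls_done()
 * to record the outcome in queue->tls_err.
 */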
1592static int nvme_tcp_start_tls(struct nvme_ctrl *nctrl,
1593 struct nvme_tcp_queue *queue,
1594 key_serial_t pskid)
1595{
1596 int qid = nvme_tcp_queue_id(queue);
1597 int ret;
1598 struct tls_handshake_args args;
1599 unsigned long tmo = tls_handshake_timeout * HZ;
1600 key_serial_t keyring = nvme_keyring_id();
1601
1602 dev_dbg(nctrl->device, "queue %d: start TLS with key %x\n",
1603 qid, pskid);
1604 memset(&args, 0, sizeof(args));
1605 args.ta_sock = queue->sock;
1606 args.ta_done = nvme_tcp_tls_done;
1607 args.ta_data = queue;
1608 args.ta_my_peerids[0] = pskid;
1609 args.ta_num_peerids = 1;
1610 if (nctrl->opts->keyring)
1611 keyring = key_serial(key: nctrl->opts->keyring);
1612 args.ta_keyring = keyring;
1613 args.ta_timeout_ms = tls_handshake_timeout * 1000;
1614 queue->tls_err = -EOPNOTSUPP;
1615 init_completion(x: &queue->tls_complete);
1616 ret = tls_client_hello_psk(args: &args, GFP_KERNEL);
1617 if (ret) {
1618 dev_err(nctrl->device, "queue %d: failed to start TLS: %d\n",
1619 qid, ret);
1620 return ret;
1621 }
1622 ret = wait_for_completion_interruptible_timeout(x: &queue->tls_complete, timeout: tmo);
1623 if (ret <= 0) {
1624 if (ret == 0)
1625 ret = -ETIMEDOUT;
1626
1627 dev_err(nctrl->device,
1628 "queue %d: TLS handshake failed, error %d\n",
1629 qid, ret);
1630 tls_handshake_cancel(sk: queue->sock->sk);
1631 } else {
1632 dev_dbg(nctrl->device,
1633 "queue %d: TLS handshake complete, error %d\n",
1634 qid, queue->tls_err);
1635 ret = queue->tls_err;
1636 }
1637 return ret;
1638}
1639
1640static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid,
1641 key_serial_t pskid)
1642{
1643 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(ctrl: nctrl);
1644 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1645 int ret, rcv_pdu_size;
1646 struct file *sock_file;
1647
1648 mutex_init(&queue->queue_lock);
1649 queue->ctrl = ctrl;
1650 init_llist_head(list: &queue->req_list);
1651 INIT_LIST_HEAD(list: &queue->send_list);
1652 mutex_init(&queue->send_mutex);
1653 INIT_WORK(&queue->io_work, nvme_tcp_io_work);
1654
1655 if (qid > 0)
1656 queue->cmnd_capsule_len = nctrl->ioccsz * 16;
1657 else
1658 queue->cmnd_capsule_len = sizeof(struct nvme_command) +
1659 NVME_TCP_ADMIN_CCSZ;
1660
1661 ret = sock_create(family: ctrl->addr.ss_family, type: SOCK_STREAM,
1662 IPPROTO_TCP, res: &queue->sock);
1663 if (ret) {
1664 dev_err(nctrl->device,
1665 "failed to create socket: %d\n", ret);
1666 goto err_destroy_mutex;
1667 }
1668
1669 sock_file = sock_alloc_file(sock: queue->sock, O_CLOEXEC, NULL);
1670 if (IS_ERR(ptr: sock_file)) {
1671 ret = PTR_ERR(ptr: sock_file);
1672 goto err_destroy_mutex;
1673 }
1674 nvme_tcp_reclassify_socket(sock: queue->sock);
1675
1676 /* Single syn retry */
1677 tcp_sock_set_syncnt(sk: queue->sock->sk, val: 1);
1678
1679 /* Set TCP no delay */
1680 tcp_sock_set_nodelay(sk: queue->sock->sk);
1681
1682 /*
1683 * Cleanup whatever is sitting in the TCP transmit queue on socket
1684 * close. This is done to prevent stale data from being sent should
1685 * the network connection be restored before TCP times out.
1686 */
1687 sock_no_linger(sk: queue->sock->sk);
1688
1689 if (so_priority > 0)
1690 sock_set_priority(sk: queue->sock->sk, priority: so_priority);
1691
1692 /* Set socket type of service */
1693 if (nctrl->opts->tos >= 0)
1694 ip_sock_set_tos(sk: queue->sock->sk, val: nctrl->opts->tos);
1695
1696 /* Set 10 seconds timeout for icresp recvmsg */
1697 queue->sock->sk->sk_rcvtimeo = 10 * HZ;
1698
1699 queue->sock->sk->sk_allocation = GFP_ATOMIC;
1700 queue->sock->sk->sk_use_task_frag = false;
1701 nvme_tcp_set_queue_io_cpu(queue);
1702 queue->request = NULL;
1703 queue->data_remaining = 0;
1704 queue->ddgst_remaining = 0;
1705 queue->pdu_remaining = 0;
1706 queue->pdu_offset = 0;
1707 sk_set_memalloc(sk: queue->sock->sk);
1708
1709 if (nctrl->opts->mask & NVMF_OPT_HOST_TRADDR) {
1710 ret = kernel_bind(sock: queue->sock, addr: (struct sockaddr *)&ctrl->src_addr,
1711 addrlen: sizeof(ctrl->src_addr));
1712 if (ret) {
1713 dev_err(nctrl->device,
1714 "failed to bind queue %d socket %d\n",
1715 qid, ret);
1716 goto err_sock;
1717 }
1718 }
1719
1720 if (nctrl->opts->mask & NVMF_OPT_HOST_IFACE) {
1721 char *iface = nctrl->opts->host_iface;
1722 sockptr_t optval = KERNEL_SOCKPTR(p: iface);
1723
1724 ret = sock_setsockopt(sock: queue->sock, SOL_SOCKET, SO_BINDTODEVICE,
1725 optval, strlen(iface));
1726 if (ret) {
1727 dev_err(nctrl->device,
1728 "failed to bind to interface %s queue %d err %d\n",
1729 iface, qid, ret);
1730 goto err_sock;
1731 }
1732 }
1733
1734 queue->hdr_digest = nctrl->opts->hdr_digest;
1735 queue->data_digest = nctrl->opts->data_digest;
1736 if (queue->hdr_digest || queue->data_digest) {
1737 ret = nvme_tcp_alloc_crypto(queue);
1738 if (ret) {
1739 dev_err(nctrl->device,
1740 "failed to allocate queue %d crypto\n", qid);
1741 goto err_sock;
1742 }
1743 }
1744
1745 rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) +
1746 nvme_tcp_hdgst_len(queue);
1747 queue->pdu = kmalloc(size: rcv_pdu_size, GFP_KERNEL);
1748 if (!queue->pdu) {
1749 ret = -ENOMEM;
1750 goto err_crypto;
1751 }
1752
1753 dev_dbg(nctrl->device, "connecting queue %d\n",
1754 nvme_tcp_queue_id(queue));
1755
1756 ret = kernel_connect(sock: queue->sock, addr: (struct sockaddr *)&ctrl->addr,
1757 addrlen: sizeof(ctrl->addr), flags: 0);
1758 if (ret) {
1759 dev_err(nctrl->device,
1760 "failed to connect socket: %d\n", ret);
1761 goto err_rcv_pdu;
1762 }
1763
1764 /* If PSKs are configured try to start TLS */
1765 if (IS_ENABLED(CONFIG_NVME_TCP_TLS) && pskid) {
1766 ret = nvme_tcp_start_tls(nctrl, queue, pskid);
1767 if (ret)
1768 goto err_init_connect;
1769 }
1770
1771 ret = nvme_tcp_init_connection(queue);
1772 if (ret)
1773 goto err_init_connect;
1774
1775 set_bit(nr: NVME_TCP_Q_ALLOCATED, addr: &queue->flags);
1776
1777 return 0;
1778
1779err_init_connect:
1780 kernel_sock_shutdown(sock: queue->sock, how: SHUT_RDWR);
1781err_rcv_pdu:
1782 kfree(objp: queue->pdu);
1783err_crypto:
1784 if (queue->hdr_digest || queue->data_digest)
1785 nvme_tcp_free_crypto(queue);
1786err_sock:
1787 /* ->sock will be released by fput() */
1788 fput(queue->sock->file);
1789 queue->sock = NULL;
1790err_destroy_mutex:
1791 mutex_destroy(lock: &queue->send_mutex);
1792 mutex_destroy(lock: &queue->queue_lock);
1793 return ret;
1794}
1795
1796static void nvme_tcp_restore_sock_ops(struct nvme_tcp_queue *queue)
1797{
1798 struct socket *sock = queue->sock;
1799
1800 write_lock_bh(&sock->sk->sk_callback_lock);
1801 sock->sk->sk_user_data = NULL;
1802 sock->sk->sk_data_ready = queue->data_ready;
1803 sock->sk->sk_state_change = queue->state_change;
1804 sock->sk->sk_write_space = queue->write_space;
1805 write_unlock_bh(&sock->sk->sk_callback_lock);
1806}
1807
1808static void __nvme_tcp_stop_queue(struct nvme_tcp_queue *queue)
1809{
1810 kernel_sock_shutdown(sock: queue->sock, how: SHUT_RDWR);
1811 nvme_tcp_restore_sock_ops(queue);
1812 cancel_work_sync(work: &queue->io_work);
1813}
1814
1815static void nvme_tcp_stop_queue(struct nvme_ctrl *nctrl, int qid)
1816{
1817 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(ctrl: nctrl);
1818 struct nvme_tcp_queue *queue = &ctrl->queues[qid];
1819
1820 if (!test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1821 return;
1822
1823 mutex_lock(&queue->queue_lock);
1824 if (test_and_clear_bit(nr: NVME_TCP_Q_LIVE, addr: &queue->flags))
1825 __nvme_tcp_stop_queue(queue);
1826 mutex_unlock(lock: &queue->queue_lock);
1827}
1828
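/*
 * Take over the socket callbacks (data_ready, state_change, write_space),
 * saving the originals so nvme_tcp_restore_sock_ops() can reinstall them
 * when the queue is torn down.
 */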
1829static void nvme_tcp_setup_sock_ops(struct nvme_tcp_queue *queue)
1830{
1831 write_lock_bh(&queue->sock->sk->sk_callback_lock);
1832 queue->sock->sk->sk_user_data = queue;
1833 queue->state_change = queue->sock->sk->sk_state_change;
1834 queue->data_ready = queue->sock->sk->sk_data_ready;
1835 queue->write_space = queue->sock->sk->sk_write_space;
1836 queue->sock->sk->sk_data_ready = nvme_tcp_data_ready;
1837 queue->sock->sk->sk_state_change = nvme_tcp_state_change;
1838 queue->sock->sk->sk_write_space = nvme_tcp_write_space;
1839#ifdef CONFIG_NET_RX_BUSY_POLL
1840 queue->sock->sk->sk_ll_usec = 1;
1841#endif
1842 write_unlock_bh(&queue->sock->sk->sk_callback_lock);
1843}
1844
1845static int nvme_tcp_start_queue(struct nvme_ctrl *nctrl, int idx)
1846{
1847 struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(ctrl: nctrl);
1848 struct nvme_tcp_queue *queue = &ctrl->queues[idx];
1849 int ret;
1850
1851 queue->rd_enabled = true;
1852 nvme_tcp_init_recv_ctx(queue);
1853 nvme_tcp_setup_sock_ops(queue);
1854
1855 if (idx)
1856 ret = nvmf_connect_io_queue(ctrl: nctrl, qid: idx);
1857 else
1858 ret = nvmf_connect_admin_queue(ctrl: nctrl);
1859
1860 if (!ret) {
1861 set_bit(nr: NVME_TCP_Q_LIVE, addr: &queue->flags);
1862 } else {
1863 if (test_bit(NVME_TCP_Q_ALLOCATED, &queue->flags))
1864 __nvme_tcp_stop_queue(queue);
1865 dev_err(nctrl->device,
1866 "failed to connect queue: %d ret=%d\n", idx, ret);
1867 }
1868 return ret;
1869}
1870
1871static void nvme_tcp_free_admin_queue(struct nvme_ctrl *ctrl)
1872{
1873 if (to_tcp_ctrl(ctrl)->async_req.pdu) {
1874 cancel_work_sync(work: &ctrl->async_event_work);
1875 nvme_tcp_free_async_req(ctrl: to_tcp_ctrl(ctrl));
1876 to_tcp_ctrl(ctrl)->async_req.pdu = NULL;
1877 }
1878
1879 nvme_tcp_free_queue(nctrl: ctrl, qid: 0);
1880}
1881
1882static void nvme_tcp_free_io_queues(struct nvme_ctrl *ctrl)
1883{
1884 int i;
1885
1886 for (i = 1; i < ctrl->queue_count; i++)
1887 nvme_tcp_free_queue(nctrl: ctrl, qid: i);
1888}
1889
1890static void nvme_tcp_stop_io_queues(struct nvme_ctrl *ctrl)
1891{
1892 int i;
1893
1894 for (i = 1; i < ctrl->queue_count; i++)
1895 nvme_tcp_stop_queue(nctrl: ctrl, qid: i);
1896}
1897
1898static int nvme_tcp_start_io_queues(struct nvme_ctrl *ctrl,
1899 int first, int last)
1900{
1901 int i, ret;
1902
1903 for (i = first; i < last; i++) {
1904 ret = nvme_tcp_start_queue(nctrl: ctrl, idx: i);
1905 if (ret)
1906 goto out_stop_queues;
1907 }
1908
1909 return 0;
1910
1911out_stop_queues:
1912 for (i--; i >= first; i--)
1913 nvme_tcp_stop_queue(nctrl: ctrl, qid: i);
1914 return ret;
1915}
1916
static int nvme_tcp_alloc_admin_queue(struct nvme_ctrl *ctrl)
{
	int ret;
	key_serial_t pskid = 0;

	if (nvme_tcp_tls(ctrl)) {
		if (ctrl->opts->tls_key)
			pskid = key_serial(ctrl->opts->tls_key);
		else
			pskid = nvme_tls_psk_default(ctrl->opts->keyring,
						     ctrl->opts->host->nqn,
						     ctrl->opts->subsysnqn);
		if (!pskid) {
			dev_err(ctrl->device, "no valid PSK found\n");
			return -ENOKEY;
		}
	}

	ret = nvme_tcp_alloc_queue(ctrl, 0, pskid);
	if (ret)
		return ret;

	ret = nvme_tcp_alloc_async_req(to_tcp_ctrl(ctrl));
	if (ret)
		goto out_free_queue;

	return 0;

out_free_queue:
	nvme_tcp_free_queue(ctrl, 0);
	return ret;
}

static int __nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
	int i, ret;

	if (nvme_tcp_tls(ctrl) && !ctrl->tls_key) {
		dev_err(ctrl->device, "no PSK negotiated\n");
		return -ENOKEY;
	}
	for (i = 1; i < ctrl->queue_count; i++) {
		ret = nvme_tcp_alloc_queue(ctrl, i,
					   key_serial(ctrl->tls_key));
		if (ret)
			goto out_free_queues;
	}

	return 0;

out_free_queues:
	for (i--; i >= 1; i--)
		nvme_tcp_free_queue(ctrl, i);

	return ret;
}

static int nvme_tcp_alloc_io_queues(struct nvme_ctrl *ctrl)
{
	unsigned int nr_io_queues;
	int ret;

	nr_io_queues = nvmf_nr_io_queues(ctrl->opts);
	ret = nvme_set_queue_count(ctrl, &nr_io_queues);
	if (ret)
		return ret;

	if (nr_io_queues == 0) {
		dev_err(ctrl->device,
			"unable to set any I/O queues\n");
		return -ENOMEM;
	}

	ctrl->queue_count = nr_io_queues + 1;
	dev_info(ctrl->device,
		"creating %d I/O queues.\n", nr_io_queues);

	nvmf_set_io_queues(ctrl->opts, nr_io_queues,
			   to_tcp_ctrl(ctrl)->io_queues);
	return __nvme_tcp_alloc_io_queues(ctrl);
}

static void nvme_tcp_destroy_io_queues(struct nvme_ctrl *ctrl, bool remove)
{
	nvme_tcp_stop_io_queues(ctrl);
	if (remove)
		nvme_remove_io_tag_set(ctrl);
	nvme_tcp_free_io_queues(ctrl);
}
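
/*
 * Allocate and connect the I/O queues. For a new controller this also
 * allocates the I/O tag set; on reconnect the existing tag set is kept,
 * only the queues it covers are started up front, the hardware queue
 * count is refreshed under a queue freeze, and any extra queues the
 * target now allows are started last.
 */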
static int nvme_tcp_configure_io_queues(struct nvme_ctrl *ctrl, bool new)
{
	int ret, nr_queues;

	ret = nvme_tcp_alloc_io_queues(ctrl);
	if (ret)
		return ret;

	if (new) {
		ret = nvme_alloc_io_tag_set(ctrl, &to_tcp_ctrl(ctrl)->tag_set,
				&nvme_tcp_mq_ops,
				ctrl->opts->nr_poll_queues ? HCTX_MAX_TYPES : 2,
				sizeof(struct nvme_tcp_request));
		if (ret)
			goto out_free_io_queues;
	}

	/*
	 * Only start I/O queues for which we have allocated the tagset
	 * and limited it to the available queues. On reconnects, the
	 * queue number might have changed.
	 */
	nr_queues = min(ctrl->tagset->nr_hw_queues + 1, ctrl->queue_count);
	ret = nvme_tcp_start_io_queues(ctrl, 1, nr_queues);
	if (ret)
		goto out_cleanup_connect_q;

	if (!new) {
		nvme_start_freeze(ctrl);
		nvme_unquiesce_io_queues(ctrl);
		if (!nvme_wait_freeze_timeout(ctrl, NVME_IO_TIMEOUT)) {
			/*
			 * If we timed out waiting for freeze we are likely to
			 * be stuck. Fail the controller initialization just
			 * to be safe.
			 */
			ret = -ENODEV;
			nvme_unfreeze(ctrl);
			goto out_wait_freeze_timed_out;
		}
		blk_mq_update_nr_hw_queues(ctrl->tagset,
			ctrl->queue_count - 1);
		nvme_unfreeze(ctrl);
	}

	/*
	 * If the number of queues has increased (reconnect case),
	 * start all new queues now.
	 */
	ret = nvme_tcp_start_io_queues(ctrl, nr_queues,
				       ctrl->tagset->nr_hw_queues + 1);
	if (ret)
		goto out_wait_freeze_timed_out;

	return 0;

out_wait_freeze_timed_out:
	nvme_quiesce_io_queues(ctrl);
	nvme_sync_io_queues(ctrl);
	nvme_tcp_stop_io_queues(ctrl);
out_cleanup_connect_q:
	nvme_cancel_tagset(ctrl);
	if (new)
		nvme_remove_io_tag_set(ctrl);
out_free_io_queues:
	nvme_tcp_free_io_queues(ctrl);
	return ret;
}

static void nvme_tcp_destroy_admin_queue(struct nvme_ctrl *ctrl, bool remove)
{
	nvme_tcp_stop_queue(ctrl, 0);
	if (remove)
		nvme_remove_admin_tag_set(ctrl);
	nvme_tcp_free_admin_queue(ctrl);
}
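
/*
 * Bring up the admin queue: allocate and connect queue 0, allocate the
 * admin tag set for new controllers, enable the controller and finish
 * generic controller initialization. Errors unwind in reverse order.
 */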
static int nvme_tcp_configure_admin_queue(struct nvme_ctrl *ctrl, bool new)
{
	int error;

	error = nvme_tcp_alloc_admin_queue(ctrl);
	if (error)
		return error;

	if (new) {
		error = nvme_alloc_admin_tag_set(ctrl,
				&to_tcp_ctrl(ctrl)->admin_tag_set,
				&nvme_tcp_admin_mq_ops,
				sizeof(struct nvme_tcp_request));
		if (error)
			goto out_free_queue;
	}

	error = nvme_tcp_start_queue(ctrl, 0);
	if (error)
		goto out_cleanup_tagset;

	error = nvme_enable_ctrl(ctrl);
	if (error)
		goto out_stop_queue;

	nvme_unquiesce_admin_queue(ctrl);

	error = nvme_init_ctrl_finish(ctrl, false);
	if (error)
		goto out_quiesce_queue;

	return 0;

out_quiesce_queue:
	nvme_quiesce_admin_queue(ctrl);
	blk_sync_queue(ctrl->admin_q);
out_stop_queue:
	nvme_tcp_stop_queue(ctrl, 0);
	nvme_cancel_admin_tagset(ctrl);
out_cleanup_tagset:
	if (new)
		nvme_remove_admin_tag_set(ctrl);
out_free_queue:
	nvme_tcp_free_admin_queue(ctrl);
	return error;
}

static void nvme_tcp_teardown_admin_queue(struct nvme_ctrl *ctrl,
		bool remove)
{
	nvme_quiesce_admin_queue(ctrl);
	blk_sync_queue(ctrl->admin_q);
	nvme_tcp_stop_queue(ctrl, 0);
	nvme_cancel_admin_tagset(ctrl);
	if (remove)
		nvme_unquiesce_admin_queue(ctrl);
	nvme_tcp_destroy_admin_queue(ctrl, remove);
}

static void nvme_tcp_teardown_io_queues(struct nvme_ctrl *ctrl,
		bool remove)
{
	if (ctrl->queue_count <= 1)
		return;
	nvme_quiesce_admin_queue(ctrl);
	nvme_quiesce_io_queues(ctrl);
	nvme_sync_io_queues(ctrl);
	nvme_tcp_stop_io_queues(ctrl);
	nvme_cancel_tagset(ctrl);
	if (remove)
		nvme_unquiesce_io_queues(ctrl);
	nvme_tcp_destroy_io_queues(ctrl, remove);
}
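
/*
 * Called after a failed (re)connect attempt or error recovery. If the
 * controller is still CONNECTING, either schedule another reconnect
 * after reconnect_delay seconds or, once nvmf_should_reconnect() says
 * the retry budget is exhausted, delete the controller.
 */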
static void nvme_tcp_reconnect_or_remove(struct nvme_ctrl *ctrl)
{
	enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

	/* If we are resetting/deleting then do nothing */
	if (state != NVME_CTRL_CONNECTING) {
		WARN_ON_ONCE(state == NVME_CTRL_NEW || state == NVME_CTRL_LIVE);
		return;
	}

	if (nvmf_should_reconnect(ctrl)) {
		dev_info(ctrl->device, "Reconnecting in %d seconds...\n",
			ctrl->opts->reconnect_delay);
		queue_delayed_work(nvme_wq, &to_tcp_ctrl(ctrl)->connect_work,
				ctrl->opts->reconnect_delay * HZ);
	} else {
		dev_info(ctrl->device, "Removing controller...\n");
		nvme_delete_ctrl(ctrl);
	}
}
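
/*
 * Core (re)connect sequence shared by the create, reset and reconnect
 * paths: configure the admin queue, validate controller capabilities
 * (ICDOFF must be zero, SGLs must be supported, queue sizes are clamped),
 * configure the I/O queues and finally transition the controller to LIVE.
 */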
static int nvme_tcp_setup_ctrl(struct nvme_ctrl *ctrl, bool new)
{
	struct nvmf_ctrl_options *opts = ctrl->opts;
	int ret;

	ret = nvme_tcp_configure_admin_queue(ctrl, new);
	if (ret)
		return ret;

	if (ctrl->icdoff) {
		ret = -EOPNOTSUPP;
		dev_err(ctrl->device, "icdoff is not supported!\n");
		goto destroy_admin;
	}

	if (!nvme_ctrl_sgl_supported(ctrl)) {
		ret = -EOPNOTSUPP;
		dev_err(ctrl->device, "Mandatory sgls are not supported!\n");
		goto destroy_admin;
	}

	if (opts->queue_size > ctrl->sqsize + 1)
		dev_warn(ctrl->device,
			"queue_size %zu > ctrl sqsize %u, clamping down\n",
			opts->queue_size, ctrl->sqsize + 1);

	if (ctrl->sqsize + 1 > ctrl->maxcmd) {
		dev_warn(ctrl->device,
			"sqsize %u > ctrl maxcmd %u, clamping down\n",
			ctrl->sqsize + 1, ctrl->maxcmd);
		ctrl->sqsize = ctrl->maxcmd - 1;
	}

	if (ctrl->queue_count > 1) {
		ret = nvme_tcp_configure_io_queues(ctrl, new);
		if (ret)
			goto destroy_admin;
	}

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_LIVE)) {
		/*
		 * A state change failure is ok if we started controller
		 * delete, but not during creation of a new controller, to
		 * avoid races with the teardown flow.
		 */
		enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
			     state != NVME_CTRL_DELETING_NOIO);
		WARN_ON_ONCE(new);
		ret = -EINVAL;
		goto destroy_io;
	}

	nvme_start_ctrl(ctrl);
	return 0;

destroy_io:
	if (ctrl->queue_count > 1) {
		nvme_quiesce_io_queues(ctrl);
		nvme_sync_io_queues(ctrl);
		nvme_tcp_stop_io_queues(ctrl);
		nvme_cancel_tagset(ctrl);
		nvme_tcp_destroy_io_queues(ctrl, new);
	}
destroy_admin:
	nvme_stop_keep_alive(ctrl);
	nvme_tcp_teardown_admin_queue(ctrl, false);
	return ret;
}

static void nvme_tcp_reconnect_ctrl_work(struct work_struct *work)
{
	struct nvme_tcp_ctrl *tcp_ctrl = container_of(to_delayed_work(work),
			struct nvme_tcp_ctrl, connect_work);
	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;

	++ctrl->nr_reconnects;

	if (nvme_tcp_setup_ctrl(ctrl, false))
		goto requeue;

	dev_info(ctrl->device, "Successfully reconnected (%d attempt)\n",
			ctrl->nr_reconnects);

	ctrl->nr_reconnects = 0;

	return;

requeue:
	dev_info(ctrl->device, "Failed reconnect attempt %d\n",
			ctrl->nr_reconnects);
	nvme_tcp_reconnect_or_remove(ctrl);
}
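
/*
 * Error recovery work: stop keep-alives, tear down the I/O and admin
 * queues (unquiescing them afterwards so pending requests fail fast) and
 * move the controller to CONNECTING so a reconnect can be scheduled, or
 * remove it if reconnecting is no longer allowed.
 */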
static void nvme_tcp_error_recovery_work(struct work_struct *work)
{
	struct nvme_tcp_ctrl *tcp_ctrl = container_of(work,
				struct nvme_tcp_ctrl, err_work);
	struct nvme_ctrl *ctrl = &tcp_ctrl->ctrl;

	nvme_stop_keep_alive(ctrl);
	flush_work(&ctrl->async_event_work);
	nvme_tcp_teardown_io_queues(ctrl, false);
	/* unquiesce to fail pending requests fast */
	nvme_unquiesce_io_queues(ctrl);
	nvme_tcp_teardown_admin_queue(ctrl, false);
	nvme_unquiesce_admin_queue(ctrl);
	nvme_auth_stop(ctrl);

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure is ok if we started ctrl delete */
		enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
			     state != NVME_CTRL_DELETING_NOIO);
		return;
	}

	nvme_tcp_reconnect_or_remove(ctrl);
}

static void nvme_tcp_teardown_ctrl(struct nvme_ctrl *ctrl, bool shutdown)
{
	nvme_tcp_teardown_io_queues(ctrl, shutdown);
	nvme_quiesce_admin_queue(ctrl);
	nvme_disable_ctrl(ctrl, shutdown);
	nvme_tcp_teardown_admin_queue(ctrl, shutdown);
}

static void nvme_tcp_delete_ctrl(struct nvme_ctrl *ctrl)
{
	nvme_tcp_teardown_ctrl(ctrl, true);
}

static void nvme_reset_ctrl_work(struct work_struct *work)
{
	struct nvme_ctrl *ctrl =
		container_of(work, struct nvme_ctrl, reset_work);

	nvme_stop_ctrl(ctrl);
	nvme_tcp_teardown_ctrl(ctrl, false);

	if (!nvme_change_ctrl_state(ctrl, NVME_CTRL_CONNECTING)) {
		/* state change failure is ok if we started ctrl delete */
		enum nvme_ctrl_state state = nvme_ctrl_state(ctrl);

		WARN_ON_ONCE(state != NVME_CTRL_DELETING &&
			     state != NVME_CTRL_DELETING_NOIO);
		return;
	}

	if (nvme_tcp_setup_ctrl(ctrl, false))
		goto out_fail;

	return;

out_fail:
	++ctrl->nr_reconnects;
	nvme_tcp_reconnect_or_remove(ctrl);
}

static void nvme_tcp_stop_ctrl(struct nvme_ctrl *ctrl)
{
	flush_work(&to_tcp_ctrl(ctrl)->err_work);
	cancel_delayed_work_sync(&to_tcp_ctrl(ctrl)->connect_work);
}

static void nvme_tcp_free_ctrl(struct nvme_ctrl *nctrl)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(nctrl);

	if (list_empty(&ctrl->list))
		goto free_ctrl;

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_del(&ctrl->list);
	mutex_unlock(&nvme_tcp_ctrl_mutex);

	nvmf_free_options(nctrl->opts);
free_ctrl:
	kfree(ctrl->queues);
	kfree(ctrl);
}

static void nvme_tcp_set_sg_null(struct nvme_command *c)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = 0;
	sg->length = 0;
	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
			NVME_SGL_FMT_TRANSPORT_A;
}

static void nvme_tcp_set_sg_inline(struct nvme_tcp_queue *queue,
		struct nvme_command *c, u32 data_len)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = cpu_to_le64(queue->ctrl->ctrl.icdoff);
	sg->length = cpu_to_le32(data_len);
	sg->type = (NVME_SGL_FMT_DATA_DESC << 4) | NVME_SGL_FMT_OFFSET;
}

static void nvme_tcp_set_sg_host_data(struct nvme_command *c,
		u32 data_len)
{
	struct nvme_sgl_desc *sg = &c->common.dptr.sgl;

	sg->addr = 0;
	sg->length = cpu_to_le32(data_len);
	sg->type = (NVME_TRANSPORT_SGL_DATA_DESC << 4) |
			NVME_SGL_FMT_TRANSPORT_A;
}
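
/*
 * Build and queue the Asynchronous Event Request command on the admin
 * queue. The AER uses the pre-allocated async_req instead of a blk-mq
 * request and carries no data (NULL SGL).
 */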
static void nvme_tcp_submit_async_event(struct nvme_ctrl *arg)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(arg);
	struct nvme_tcp_queue *queue = &ctrl->queues[0];
	struct nvme_tcp_cmd_pdu *pdu = ctrl->async_req.pdu;
	struct nvme_command *cmd = &pdu->cmd;
	u8 hdgst = nvme_tcp_hdgst_len(queue);

	memset(pdu, 0, sizeof(*pdu));
	pdu->hdr.type = nvme_tcp_cmd;
	if (queue->hdr_digest)
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst);

	cmd->common.opcode = nvme_admin_async_event;
	cmd->common.command_id = NVME_AQ_BLK_MQ_DEPTH;
	cmd->common.flags |= NVME_CMD_SGL_METABUF;
	nvme_tcp_set_sg_null(cmd);

	ctrl->async_req.state = NVME_TCP_SEND_CMD_PDU;
	ctrl->async_req.offset = 0;
	ctrl->async_req.curr_bio = NULL;
	ctrl->async_req.data_len = 0;

	nvme_tcp_queue_request(&ctrl->async_req, true, true);
}

static void nvme_tcp_complete_timed_out(struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;

	nvme_tcp_stop_queue(ctrl, nvme_tcp_queue_id(req->queue));
	nvmf_complete_timed_out_request(rq);
}
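
/*
 * blk-mq timeout handler. While the controller is LIVE, a timeout kicks
 * off error recovery and the timer is re-armed so the request is failed
 * by the recovery path. In any other state the request is completed here
 * directly (after shutting the queue down) so it cannot block controller
 * setup or teardown.
 */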
static enum blk_eh_timer_return nvme_tcp_timeout(struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_ctrl *ctrl = &req->queue->ctrl->ctrl;
	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
	struct nvme_command *cmd = &pdu->cmd;
	int qid = nvme_tcp_queue_id(req->queue);

	dev_warn(ctrl->device,
		 "I/O tag %d (%04x) type %d opcode %#x (%s) QID %d timeout\n",
		 rq->tag, nvme_cid(rq), pdu->hdr.type, cmd->common.opcode,
		 nvme_fabrics_opcode_str(qid, cmd), qid);

	if (nvme_ctrl_state(ctrl) != NVME_CTRL_LIVE) {
		/*
		 * If we are resetting, connecting or deleting, we should
		 * complete immediately because we may block the controller
		 * teardown or setup sequence:
		 * - ctrl disable/shutdown fabrics requests
		 * - connect requests
		 * - initialization admin requests
		 * - I/O requests that entered after unquiescing and
		 *   the controller stopped responding
		 *
		 * All other requests should be cancelled by the error
		 * recovery work, so it's fine that we fail this one here.
		 */
		nvme_tcp_complete_timed_out(rq);
		return BLK_EH_DONE;
	}

	/*
	 * LIVE state should trigger the normal error recovery which will
	 * handle completing this request.
	 */
	nvme_tcp_error_recovery(ctrl);
	return BLK_EH_RESET_TIMER;
}

static blk_status_t nvme_tcp_map_data(struct nvme_tcp_queue *queue,
			struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
	struct nvme_command *c = &pdu->cmd;

	c->common.flags |= NVME_CMD_SGL_METABUF;

	if (!blk_rq_nr_phys_segments(rq))
		nvme_tcp_set_sg_null(c);
	else if (rq_data_dir(rq) == WRITE &&
	    req->data_len <= nvme_tcp_inline_data_size(req))
		nvme_tcp_set_sg_inline(queue, c, req->data_len);
	else
		nvme_tcp_set_sg_host_data(c, req->data_len);

	return 0;
}
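
/*
 * Prepare the command PDU for a blk-mq request: initialize the
 * per-request send state, decide whether a write payload fits inline in
 * the command capsule, and fill in the PDU header (digest flags, header
 * length, data offset and total PDU length) before the data is mapped.
 */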
static blk_status_t nvme_tcp_setup_cmd_pdu(struct nvme_ns *ns,
		struct request *rq)
{
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	struct nvme_tcp_cmd_pdu *pdu = nvme_tcp_req_cmd_pdu(req);
	struct nvme_tcp_queue *queue = req->queue;
	u8 hdgst = nvme_tcp_hdgst_len(queue), ddgst = 0;
	blk_status_t ret;

	ret = nvme_setup_cmd(ns, rq);
	if (ret)
		return ret;

	req->state = NVME_TCP_SEND_CMD_PDU;
	req->status = cpu_to_le16(NVME_SC_SUCCESS);
	req->offset = 0;
	req->data_sent = 0;
	req->pdu_len = 0;
	req->pdu_sent = 0;
	req->h2cdata_left = 0;
	req->data_len = blk_rq_nr_phys_segments(rq) ?
				blk_rq_payload_bytes(rq) : 0;
	req->curr_bio = rq->bio;
	if (req->curr_bio && req->data_len)
		nvme_tcp_init_iter(req, rq_data_dir(rq));

	if (rq_data_dir(rq) == WRITE &&
	    req->data_len <= nvme_tcp_inline_data_size(req))
		req->pdu_len = req->data_len;

	pdu->hdr.type = nvme_tcp_cmd;
	pdu->hdr.flags = 0;
	if (queue->hdr_digest)
		pdu->hdr.flags |= NVME_TCP_F_HDGST;
	if (queue->data_digest && req->pdu_len) {
		pdu->hdr.flags |= NVME_TCP_F_DDGST;
		ddgst = nvme_tcp_ddgst_len(queue);
	}
	pdu->hdr.hlen = sizeof(*pdu);
	pdu->hdr.pdo = req->pdu_len ? pdu->hdr.hlen + hdgst : 0;
	pdu->hdr.plen =
		cpu_to_le32(pdu->hdr.hlen + hdgst + req->pdu_len + ddgst);

	ret = nvme_tcp_map_data(queue, rq);
	if (unlikely(ret)) {
		nvme_cleanup_cmd(rq);
		dev_err(queue->ctrl->ctrl.device,
			"Failed to map data (%d)\n", ret);
		return ret;
	}

	return 0;
}

static void nvme_tcp_commit_rqs(struct blk_mq_hw_ctx *hctx)
{
	struct nvme_tcp_queue *queue = hctx->driver_data;

	if (!llist_empty(&queue->req_list))
		queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work);
}
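
/*
 * blk-mq .queue_rq handler: check that the queue and controller are ready
 * for this request, build the command PDU, start the request and hand it
 * to the per-queue send path via nvme_tcp_queue_request().
 */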
static blk_status_t nvme_tcp_queue_rq(struct blk_mq_hw_ctx *hctx,
		const struct blk_mq_queue_data *bd)
{
	struct nvme_ns *ns = hctx->queue->queuedata;
	struct nvme_tcp_queue *queue = hctx->driver_data;
	struct request *rq = bd->rq;
	struct nvme_tcp_request *req = blk_mq_rq_to_pdu(rq);
	bool queue_ready = test_bit(NVME_TCP_Q_LIVE, &queue->flags);
	blk_status_t ret;

	if (!nvme_check_ready(&queue->ctrl->ctrl, rq, queue_ready))
		return nvme_fail_nonready_command(&queue->ctrl->ctrl, rq);

	ret = nvme_tcp_setup_cmd_pdu(ns, rq);
	if (unlikely(ret))
		return ret;

	nvme_start_request(rq);

	nvme_tcp_queue_request(req, true, bd->last);

	return BLK_STS_OK;
}

static void nvme_tcp_map_queues(struct blk_mq_tag_set *set)
{
	struct nvme_tcp_ctrl *ctrl = to_tcp_ctrl(set->driver_data);

	nvmf_map_queues(set, &ctrl->ctrl, ctrl->io_queues);
}
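
/*
 * blk-mq .poll handler for polling queues: busy-poll the socket if
 * nothing has been received yet, then reap completions directly from the
 * polling context. Returns the number of completions seen.
 */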
static int nvme_tcp_poll(struct blk_mq_hw_ctx *hctx, struct io_comp_batch *iob)
{
	struct nvme_tcp_queue *queue = hctx->driver_data;
	struct sock *sk = queue->sock->sk;

	if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
		return 0;

	set_bit(NVME_TCP_Q_POLLING, &queue->flags);
	if (sk_can_busy_loop(sk) && skb_queue_empty_lockless(&sk->sk_receive_queue))
		sk_busy_loop(sk, true);
	nvme_tcp_try_recv(queue);
	clear_bit(NVME_TCP_Q_POLLING, &queue->flags);
	return queue->nr_cqe;
}

static int nvme_tcp_get_address(struct nvme_ctrl *ctrl, char *buf, int size)
{
	struct nvme_tcp_queue *queue = &to_tcp_ctrl(ctrl)->queues[0];
	struct sockaddr_storage src_addr;
	int ret, len;

	len = nvmf_get_address(ctrl, buf, size);

	mutex_lock(&queue->queue_lock);

	if (!test_bit(NVME_TCP_Q_LIVE, &queue->flags))
		goto done;
	ret = kernel_getsockname(queue->sock, (struct sockaddr *)&src_addr);
	if (ret > 0) {
		if (len > 0)
			len--; /* strip trailing newline */
		len += scnprintf(buf + len, size - len, "%ssrc_addr=%pISc\n",
				(len) ? "," : "", &src_addr);
	}
done:
	mutex_unlock(&queue->queue_lock);

	return len;
}

static const struct blk_mq_ops nvme_tcp_mq_ops = {
	.queue_rq = nvme_tcp_queue_rq,
	.commit_rqs = nvme_tcp_commit_rqs,
	.complete = nvme_complete_rq,
	.init_request = nvme_tcp_init_request,
	.exit_request = nvme_tcp_exit_request,
	.init_hctx = nvme_tcp_init_hctx,
	.timeout = nvme_tcp_timeout,
	.map_queues = nvme_tcp_map_queues,
	.poll = nvme_tcp_poll,
};

static const struct blk_mq_ops nvme_tcp_admin_mq_ops = {
	.queue_rq = nvme_tcp_queue_rq,
	.complete = nvme_complete_rq,
	.init_request = nvme_tcp_init_request,
	.exit_request = nvme_tcp_exit_request,
	.init_hctx = nvme_tcp_init_admin_hctx,
	.timeout = nvme_tcp_timeout,
};

static const struct nvme_ctrl_ops nvme_tcp_ctrl_ops = {
	.name = "tcp",
	.module = THIS_MODULE,
	.flags = NVME_F_FABRICS | NVME_F_BLOCKING,
	.reg_read32 = nvmf_reg_read32,
	.reg_read64 = nvmf_reg_read64,
	.reg_write32 = nvmf_reg_write32,
	.free_ctrl = nvme_tcp_free_ctrl,
	.submit_async_event = nvme_tcp_submit_async_event,
	.delete_ctrl = nvme_tcp_delete_ctrl,
	.get_address = nvme_tcp_get_address,
	.stop_ctrl = nvme_tcp_stop_ctrl,
};

static bool
nvme_tcp_existing_controller(struct nvmf_ctrl_options *opts)
{
	struct nvme_tcp_ctrl *ctrl;
	bool found = false;

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list) {
		found = nvmf_ip_options_match(&ctrl->ctrl, opts);
		if (found)
			break;
	}
	mutex_unlock(&nvme_tcp_ctrl_mutex);

	return found;
}
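
/*
 * Transport .create_ctrl callback: allocate the controller, parse and
 * validate the target/host addresses and interface, reject duplicate
 * connections unless explicitly allowed, register with the core and run
 * the initial connect via nvme_tcp_setup_ctrl().
 */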
static struct nvme_ctrl *nvme_tcp_create_ctrl(struct device *dev,
		struct nvmf_ctrl_options *opts)
{
	struct nvme_tcp_ctrl *ctrl;
	int ret;

	ctrl = kzalloc(sizeof(*ctrl), GFP_KERNEL);
	if (!ctrl)
		return ERR_PTR(-ENOMEM);

	INIT_LIST_HEAD(&ctrl->list);
	ctrl->ctrl.opts = opts;
	ctrl->ctrl.queue_count = opts->nr_io_queues + opts->nr_write_queues +
				opts->nr_poll_queues + 1;
	ctrl->ctrl.sqsize = opts->queue_size - 1;
	ctrl->ctrl.kato = opts->kato;

	INIT_DELAYED_WORK(&ctrl->connect_work,
			nvme_tcp_reconnect_ctrl_work);
	INIT_WORK(&ctrl->err_work, nvme_tcp_error_recovery_work);
	INIT_WORK(&ctrl->ctrl.reset_work, nvme_reset_ctrl_work);

	if (!(opts->mask & NVMF_OPT_TRSVCID)) {
		opts->trsvcid =
			kstrdup(__stringify(NVME_TCP_DISC_PORT), GFP_KERNEL);
		if (!opts->trsvcid) {
			ret = -ENOMEM;
			goto out_free_ctrl;
		}
		opts->mask |= NVMF_OPT_TRSVCID;
	}

	ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
			opts->traddr, opts->trsvcid, &ctrl->addr);
	if (ret) {
		pr_err("malformed address passed: %s:%s\n",
			opts->traddr, opts->trsvcid);
		goto out_free_ctrl;
	}

	if (opts->mask & NVMF_OPT_HOST_TRADDR) {
		ret = inet_pton_with_scope(&init_net, AF_UNSPEC,
			opts->host_traddr, NULL, &ctrl->src_addr);
		if (ret) {
			pr_err("malformed src address passed: %s\n",
			       opts->host_traddr);
			goto out_free_ctrl;
		}
	}

	if (opts->mask & NVMF_OPT_HOST_IFACE) {
		if (!__dev_get_by_name(&init_net, opts->host_iface)) {
			pr_err("invalid interface passed: %s\n",
			       opts->host_iface);
			ret = -ENODEV;
			goto out_free_ctrl;
		}
	}

	if (!opts->duplicate_connect && nvme_tcp_existing_controller(opts)) {
		ret = -EALREADY;
		goto out_free_ctrl;
	}

	ctrl->queues = kcalloc(ctrl->ctrl.queue_count, sizeof(*ctrl->queues),
				GFP_KERNEL);
	if (!ctrl->queues) {
		ret = -ENOMEM;
		goto out_free_ctrl;
	}

	ret = nvme_init_ctrl(&ctrl->ctrl, dev, &nvme_tcp_ctrl_ops, 0);
	if (ret)
		goto out_kfree_queues;

	if (!nvme_change_ctrl_state(&ctrl->ctrl, NVME_CTRL_CONNECTING)) {
		WARN_ON_ONCE(1);
		ret = -EINTR;
		goto out_uninit_ctrl;
	}

	ret = nvme_tcp_setup_ctrl(&ctrl->ctrl, true);
	if (ret)
		goto out_uninit_ctrl;

	dev_info(ctrl->ctrl.device, "new ctrl: NQN \"%s\", addr %pISp, hostnqn: %s\n",
		nvmf_ctrl_subsysnqn(&ctrl->ctrl), &ctrl->addr, opts->host->nqn);

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_add_tail(&ctrl->list, &nvme_tcp_ctrl_list);
	mutex_unlock(&nvme_tcp_ctrl_mutex);

	return &ctrl->ctrl;

out_uninit_ctrl:
	nvme_uninit_ctrl(&ctrl->ctrl);
	nvme_put_ctrl(&ctrl->ctrl);
	if (ret > 0)
		ret = -EIO;
	return ERR_PTR(ret);
out_kfree_queues:
	kfree(ctrl->queues);
out_free_ctrl:
	kfree(ctrl);
	return ERR_PTR(ret);
}
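
/*
 * Fabrics transport registration. "tcp" is the transport name userspace
 * selects when creating a controller; for example (illustrative only,
 * the address and NQN are placeholders and flag spelling may differ
 * between nvme-cli versions):
 *
 *	nvme connect -t tcp -a 192.168.1.10 -s 4420 \
 *		-n nqn.2014-08.org.example:subsys1
 *
 * which ends up in nvme_tcp_create_ctrl() via the fabrics layer.
 */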
static struct nvmf_transport_ops nvme_tcp_transport = {
	.name = "tcp",
	.module = THIS_MODULE,
	.required_opts = NVMF_OPT_TRADDR,
	.allowed_opts = NVMF_OPT_TRSVCID | NVMF_OPT_RECONNECT_DELAY |
			NVMF_OPT_HOST_TRADDR | NVMF_OPT_CTRL_LOSS_TMO |
			NVMF_OPT_HDR_DIGEST | NVMF_OPT_DATA_DIGEST |
			NVMF_OPT_NR_WRITE_QUEUES | NVMF_OPT_NR_POLL_QUEUES |
			NVMF_OPT_TOS | NVMF_OPT_HOST_IFACE | NVMF_OPT_TLS |
			NVMF_OPT_KEYRING | NVMF_OPT_TLS_KEY,
	.create_ctrl = nvme_tcp_create_ctrl,
};

static int __init nvme_tcp_init_module(void)
{
	unsigned int wq_flags = WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_SYSFS;

	BUILD_BUG_ON(sizeof(struct nvme_tcp_hdr) != 8);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_cmd_pdu) != 72);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_data_pdu) != 24);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_rsp_pdu) != 24);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_r2t_pdu) != 24);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_icreq_pdu) != 128);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_icresp_pdu) != 128);
	BUILD_BUG_ON(sizeof(struct nvme_tcp_term_pdu) != 24);

	if (wq_unbound)
		wq_flags |= WQ_UNBOUND;

	nvme_tcp_wq = alloc_workqueue("nvme_tcp_wq", wq_flags, 0);
	if (!nvme_tcp_wq)
		return -ENOMEM;

	nvmf_register_transport(&nvme_tcp_transport);
	return 0;
}

static void __exit nvme_tcp_cleanup_module(void)
{
	struct nvme_tcp_ctrl *ctrl;

	nvmf_unregister_transport(&nvme_tcp_transport);

	mutex_lock(&nvme_tcp_ctrl_mutex);
	list_for_each_entry(ctrl, &nvme_tcp_ctrl_list, list)
		nvme_delete_ctrl(&ctrl->ctrl);
	mutex_unlock(&nvme_tcp_ctrl_mutex);
	flush_workqueue(nvme_delete_wq);

	destroy_workqueue(nvme_tcp_wq);
}

module_init(nvme_tcp_init_module);
module_exit(nvme_tcp_cleanup_module);

MODULE_DESCRIPTION("NVMe host TCP transport driver");
MODULE_LICENSE("GPL v2");