Vring Queue for Virtio


As mentioned in the previous article, virtqueues are created via setup_vq during virtio device initialization. A virtqueue does its actual work through a vring: we can regard struct virtqueue as an interface class, and struct vring_virtqueue as the implementation of that interface.

/**
 * virtqueue - a queue to register buffers for sending or receiving.
 * @list: the chain of virtqueues for this device
 * @callback: the function to call when buffers are consumed (can be NULL)
 * @name: the name of this virtqueue (mainly for debugging)
 * @vdev: the virtio device this queue was created for
 * @priv: a pointer for the virtqueue implementation to use
 */
struct virtqueue {
    struct list_head list;
    void (*callback)(struct virtqueue *vq);
    const char *name;
    struct virtio_device *vdev;
    void *priv;
};
For PCI devices, priv points to a virtio_pci_vq_info structure, whose vq member points back to the virtqueue interface, queue holds the virtual address of the actual ring, msix_vector is the MSI-X interrupt vector, and so on.
struct virtio_pci_vq_info {
    /* the actual virtqueue */
    struct virtqueue *vq;
    /* the number of entries in the queue */
    int num;
    /* the index of the queue */
    int queue_index;
    /* the virtual address of the ring queue */
    void *queue;
    /* the list node for the virtqueues list */
    struct list_head node;
    /* MSI-X vector (or none) */
    unsigned msix_vector;
};

The vring_virtqueue structure is defined as follows:

struct vring_virtqueue {
    struct virtqueue vq;        /* the virtqueue interface */

    /* Actual memory layout for this queue */
    struct vring vring;         /* vring memory layout */

    /* Other side has made a mess, don't try any more. */
    bool broken;

    /* Host supports indirect buffers */
    bool indirect;              /* transport feature bit: indirect buffer support */

    /* Host publishes avail event idx */
    bool event;                 /* transport feature bit: event idx interrupt/notify */

    /* Number of free buffers */
    unsigned int num_free;      /* free entries remaining in vring.desc; they form a linked list */
    /* Head of free buffer list. */
    unsigned int free_head;     /* index of the head of the free descriptor list */
    /* Number we've added since last sync. */
    unsigned int num_added;     /* number of add calls since the last sync; note this counts calls, not buffers */
    /* Last used index we've seen. */
    u16 last_used_idx;

    /* How to notify other side. FIXME: commonalize hcalls! */
    void (*notify)(struct virtqueue *vq);

#ifdef DEBUG
    /* They're supposed to lock for us. */
    unsigned int in_use;
#endif

    /* Tokens for callbacks. */
    void *data[];               /* token array, one entry per vring descriptor */
};

The virtio specification defines several standard operations on a virtqueue. For example, creating one:

struct virtqueue *vring_new_virtqueue(unsigned int num,
                                      unsigned int vring_align,
                                      struct virtio_device *vdev,
                                      void *pages,
                                      void (*notify)(struct virtqueue *),
                                      void (*callback)(struct virtqueue *),
                                      const char *name)
{
    struct vring_virtqueue *vq;
    unsigned int i;

    /* We assume num is a power of 2. */
    if (num & (num - 1)) {
        dev_warn(&vdev->dev, "Bad virtqueue length %u\n", num);
        return NULL;
    }

    /* sizeof(struct vring_virtqueue) plus the trailing data[] token array */
    vq = kmalloc(sizeof(*vq) + sizeof(void *) * num, GFP_KERNEL);
    if (!vq)
        return NULL;

    vring_init(&vq->vring, num, pages, vring_align);
    vq->vq.callback = callback;
    vq->vq.vdev = vdev;
    vq->vq.name = name;
    vq->notify = notify;
    vq->broken = false;
    vq->last_used_idx = 0;
    vq->num_added = 0;
    list_add_tail(&vq->vq.list, &vdev->vqs);
#ifdef DEBUG
    vq->in_use = false;
#endif

    vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC); /* indirect buffer support */
    vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX);        /* notify/interrupt gated via event idx */

    /* No callback?  Tell other side not to bother us. */
    if (!callback)
        vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT; /* no callback, so suppress interrupts */

    /* Put everything in free lists. */
    vq->num_free = num;
    vq->free_head = 0;  /* index 0 is the head of the free descriptor list */
    for (i = 0; i < num - 1; i++) {
        /* chain vring_desc[0], vring_desc[1], ... into a list */
        vq->vring.desc[i].next = i + 1;
        vq->data[i] = NULL;
    }
    vq->data[i] = NULL;

    return &vq->vq;
}
vring_new_virtqueue creates a vring_virtqueue; the allocation size is sizeof(struct vring_virtqueue) + num * sizeof(void *). The pages argument passed in supplies the memory for the vring layout.

/**
 * virtqueue_add_buf - expose buffer to other end
 * @vq: the struct virtqueue we're talking about
 * @sg: the description of the buffer(s)
 * @out: the number of sg entries readable by other side
 * @in: the number of sg entries which are writable (after readable ones)
 * @data: the token identifying the buffer
 *
 * Returns remaining capacity of the queue (sg segments) or a negative error.
 */
int virtqueue_add_buf(struct virtqueue *_vq,
                      struct scatterlist sg[],
                      unsigned int out,
                      unsigned int in,
                      void *data)
{
    struct vring_virtqueue *vq = to_vvq(_vq);
    unsigned int i, avail, head, uninitialized_var(prev);

    START_USE(vq);

    BUG_ON(data == NULL);

    /* If the host supports indirect descriptor tables, and we have multiple
     * buffers, then go indirect.  FIXME: tune this threshold */
    if (vq->indirect && (out + in) > 1 && vq->num_free) {
        head = vring_add_indirect(vq, sg, out, in);
        if (head != vq->vring.num)
            goto add_head;
    }

    BUG_ON(out + in > vq->vring.num);
    BUG_ON(out + in == 0);

    if (vq->num_free < out + in) {
        pr_debug("Can't add buf len %i - avail = %i\n",
                 out + in, vq->num_free);
        /* FIXME: for historical reasons, we force a notify here if
         * there are outgoing parts to the buffer.  Presumably the
         * host should service the ring ASAP. */
        if (out)
            vq->notify(&vq->vq);
        END_USE(vq);
        return -ENOSPC;
    }

    /* We're about to use some buffers from the free list. */
    vq->num_free -= out + in;

    /* Start at vring_desc[vq->free_head] and fill in descriptor entries
     * for the newly added buffers */
    head = vq->free_head;
    for (i = vq->free_head; out; i = vq->vring.desc[i].next, out--) {
        vq->vring.desc[i].flags = VRING_DESC_F_NEXT;
        vq->vring.desc[i].addr = sg_phys(sg);
        vq->vring.desc[i].len = sg->length;
        prev = i;
        sg++;
    }
    for (; in; i = vq->vring.desc[i].next, in--) {
        vq->vring.desc[i].flags = VRING_DESC_F_NEXT | VRING_DESC_F_WRITE;
        vq->vring.desc[i].addr = sg_phys(sg);
        vq->vring.desc[i].len = sg->length;
        prev = i;
        sg++;
    }
    /* Last one doesn't continue. */
    vq->vring.desc[prev].flags &= ~VRING_DESC_F_NEXT;  /* the final descriptor carries no NEXT flag */

    /* Update free pointer; the chain starting at head is now ready
     * to be published through the avail ring */
    vq->free_head = i;

add_head:
    /* Set token. */
    vq->data[head] = data;  /* remember the caller's token for this chain head */

    /* Put entry in available array (but don't update avail->idx until they
     * do sync).  FIXME: avoid modulus here? */
    avail = (vq->vring.avail->idx + vq->num_added++) % vq->vring.num;
    vq->vring.avail->ring[avail] = head;

    pr_debug("Added buffer head %i to %p\n", head, vq);
    END_USE(vq);

    /* If we're indirect, we can fit many (assuming not OOM). */
    if (vq->indirect)
        return vq->num_free ? vq->vring.num : 0;
    return vq->num_free;
}

virtqueue_add_buf links the incoming scatterlist into free entries of the vring_desc table and fills in an avail-ring slot pointing at the head of the newly built descriptor chain. Because there may be several virtqueue_add_buf calls before the avail idx is synchronized to the other side, vring_virtqueue keeps num_added, the number of times virtqueue_add_buf has been called since the last sync. Look at this code:

    /* Put entry in available array (but don't update avail->idx until they
     * do sync).  FIXME: avoid modulus here? */
    avail = (vq->vring.avail->idx + vq->num_added++) % vq->vring.num;
    vq->vring.avail->ring[avail] = head;
After initialization vq->num_added == 0, so avail == vq->vring.avail->idx, and vring_avail.ring[idx] points at the head of the newly added descriptor chain. vring.avail->idx itself is left unchanged; only vq->num_added is incremented by 1, so the next virtqueue_add_buf call makes vring_avail.ring[idx+1] point at its newly added chain head, and so on.

If the vring supports indirect descriptors, adding a new buffer chain becomes much simpler; it is implemented by vring_add_indirect:

static int vring_add_indirect(struct vring_virtqueue *vq,
                              struct scatterlist sg[],
                              unsigned int out,
                              unsigned int in)
{
    struct vring_desc *desc;
    unsigned head;
    int i;

    /* allocate out + in vring_desc entries for the indirect table */
    desc = kmalloc((out + in) * sizeof(struct vring_desc), GFP_ATOMIC);
    if (!desc)
        return vq->vring.num;

    /* Transfer entries from the sg list to the indirect page */
    for (i = 0; i < out; i++) {
        desc[i].flags = VRING_DESC_F_NEXT;
        desc[i].addr = sg_phys(sg);
        desc[i].len = sg->length;
        desc[i].next = i + 1;
        sg++;
    }
    for (; i < (out + in); i++) {
        desc[i].flags = VRING_DESC_F_NEXT | VRING_DESC_F_WRITE;
        desc[i].addr = sg_phys(sg);
        desc[i].len = sg->length;
        desc[i].next = i + 1;
        sg++;
    }

    /* Last one doesn't continue. */
    desc[i - 1].flags &= ~VRING_DESC_F_NEXT;
    desc[i - 1].next = 0;

    /* We're about to use a buffer */
    vq->num_free--;  /* the main ring consumes only a single vring_desc entry */

    /* Use a single buffer which doesn't continue */
    head = vq->free_head;
    vq->vring.desc[head].flags = VRING_DESC_F_INDIRECT;
    vq->vring.desc[head].addr = virt_to_phys(desc);
    vq->vring.desc[head].len = i * sizeof(struct vring_desc);

    /* Update free pointer */
    vq->free_head = vq->vring.desc[head].next;  /* free_head moves back by one entry */

    return head;
}
In the indirect case, only a single entry is added to the main vring_desc table, and it points at a separately allocated indirect vring_desc array.

/**
 * virtqueue_kick - update after add_buf
 * @vq: the struct virtqueue
 *
 * After one or more virtqueue_add_buf calls, invoke this to kick
 * the other side.
 *
 * Caller must ensure we don't call this with other virtqueue
 * operations at the same time (except where noted).
 */
void virtqueue_kick(struct virtqueue *vq)
{
    if (virtqueue_kick_prepare(vq))
        virtqueue_notify(vq);
}

bool virtqueue_kick_prepare(struct virtqueue *_vq)
{
    struct vring_virtqueue *vq = to_vvq(_vq);
    u16 new, old;
    bool needs_kick;

    START_USE(vq);
    /* Descriptors and available array need to be set before we expose the
     * new available array entries. */
    virtio_wmb();

    old = vq->vring.avail->idx;
    new = vq->vring.avail->idx = old + vq->num_added;
    vq->num_added = 0;

    /* Need to update avail index before checking if we should notify */
    virtio_mb();

    if (vq->event) {
        needs_kick = vring_need_event(vring_avail_event(&vq->vring),
                                      new, old);
    } else {
        needs_kick = !(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY);
    }
    END_USE(vq);
    return needs_kick;
}
virtqueue_kick is used to inform the qemu/vhost side that the avail ring has been updated. virtqueue_kick_prepare works out whether a kick is actually required, and virtqueue_notify writes the queue index to the QUEUE_NOTIFY field of the virtio device's BAR0 configuration space, causing a vmexit that is caught by qemu/vhost.

In virtqueue_kick_prepare, if the VIRTIO_RING_F_EVENT_IDX feature is supported, the change in vq->vring.avail->idx is compared against the event index the other side publishes in the ring. What is this event index for? Put simply, each sync decides whether to kick the other end by checking whether the avail entries the guest has added since the last sync have overtaken what the host has consumed. For example, suppose the guest adds 5 avail entries and syncs once, then adds 5 more and syncs again. If after the first sync the host has only consumed 4 of those entries, the second sync finds the condition false and does not kick the virtqueue; the host, still actively processing, will pick up the new entries on its own. Only once the host has consumed all the entries from the first sync will the next sync kick it again. This mechanism is somewhat similar to edge triggering.

void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len)
{
    struct vring_virtqueue *vq = to_vvq(_vq);
    void *ret;
    unsigned int i;

    START_USE(vq);

    if (unlikely(vq->broken)) {
        END_USE(vq);
        return NULL;
    }

    if (!more_used(vq)) {
        pr_debug("No more buffers in queue\n");
        END_USE(vq);
        return NULL;
    }

    /* Only get used array entries after they have been exposed by host. */
    virtio_rmb();

    /* fetch the used_elem that last_used_idx points to */
    i = vq->vring.used->ring[vq->last_used_idx % vq->vring.num].id;
    *len = vq->vring.used->ring[vq->last_used_idx % vq->vring.num].len;

    if (unlikely(i >= vq->vring.num)) {
        BAD_RING(vq, "id %u out of range\n", i);
        return NULL;
    }
    if (unlikely(!vq->data[i])) {
        BAD_RING(vq, "id %u is not a head!\n", i);
        return NULL;
    }

    /* detach_buf clears data, so grab it now. */
    ret = vq->data[i];
    /* detach_buf returns the vring_desc chain referenced by this used
     * entry to the free list headed by free_head */
    detach_buf(vq, i);
    vq->last_used_idx++;  /* advance last_used_idx */

    /* If we expect an interrupt for the next entry, tell host
     * by writing event index and flush out the write before
     * the read in the next get_buf call. */
    if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) {
        vring_used_event(&vq->vring) = vq->last_used_idx;  /* publish last_used_idx to the other side */
        virtio_mb();
    }

    END_USE(vq);
    return ret;
}
virtqueue_get_buf reclaims the used-ring entry that last_used_idx points to.

static void detach_buf(struct vring_virtqueue *vq, unsigned int head)
{
    unsigned int i;

    /* Clear data ptr. */
    vq->data[head] = NULL;

    /* Put back on free list: find end */
    i = head;

    /* Free the indirect table */
    if (vq->vring.desc[i].flags & VRING_DESC_F_INDIRECT)
        kfree(phys_to_virt(vq->vring.desc[i].addr));

    while (vq->vring.desc[i].flags & VRING_DESC_F_NEXT) {
        i = vq->vring.desc[i].next;
        vq->num_free++;
    }

    vq->vring.desc[i].next = vq->free_head;
    vq->free_head = head;

    /* Plus final descriptor */
    vq->num_free++;
}
The actual reclamation is done by detach_buf. Note that it does nothing with the addr fields in the vring_desc entries, which implies that the buffers those addresses describe must have been reclaimed elsewhere beforehand.

virtqueue_disable_cb is used to turn interrupts off, and virtqueue_enable_cb to turn them back on. Both are relatively simple, so they are not analysed in detail here:

void virtqueue_disable_cb(struct virtqueue *_vq)
{
    struct vring_virtqueue *vq = to_vvq(_vq);

    vq->vring.avail->flags |= VRING_AVAIL_F_NO_INTERRUPT;
}

bool virtqueue_enable_cb(struct virtqueue *_vq)
{
    struct vring_virtqueue *vq = to_vvq(_vq);

    START_USE(vq);

    /* We optimistically turn back on interrupts, then check if there was
     * more to do. */
    /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to
     * either clear the flags bit or point the event index at the next
     * entry.  Always do both to keep code simple. */
    vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT;
    vring_used_event(&vq->vring) = vq->last_used_idx;
    virtio_mb();
    if (unlikely(more_used(vq))) {
        END_USE(vq);
        return false;
    }

    END_USE(vq);
    return true;
}
