Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

PWX-31760 kernel page fragment allocator #281

Open
wants to merge 2 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
162 changes: 162 additions & 0 deletions io.c
Original file line number Diff line number Diff line change
Expand Up @@ -227,6 +227,81 @@ static void io_sq_wq_submit_work(struct work_struct *work);

struct kmem_cache *req_cachep;

/*
 * Pop the most recently stored buffer off the context's msg_bufs stack.
 *
 * Returns the buffer pointer, or NULL when the stack is empty.
 */
static void *io_alloc_msg_buf(struct io_ring_ctx *ctx)
{
	unsigned top = ctx->nr_msg_bufs;

	if (!top)
		return NULL;

	/* the last valid entry lives at index count - 1 */
	top -= 1;
	ctx->nr_msg_bufs = top;
	return ctx->msg_bufs[top];
}

/*
 * Initial reference count of a fragment page while the allocator owns it.
 *
 * io_new_fragment_page() sets the page counter to this value. When the
 * allocator gives the page up it subtracts (ALLOC_BIAS - num_allocs) in one
 * atomic operation, so the counter reaches zero only once every fragment
 * handed out from the page has also been released (presumably by user space
 * dropping one reference per fragment -- the release side is not in this
 * file section, confirm).
 *
 * The value must be greater than the maximum number of allocations per page
 * so the counter stays positive while the allocator owns the page. Every
 * non-empty allocation is rounded up to 8 bytes of the 4088-byte data area,
 * i.e. at most 511 of them fit in one page.
 */
#define ALLOC_BIAS 4096

static void io_new_fragment_page(struct user_page_fragment_allocator *alloc)
{
alloc->offset = 0;
alloc->num_allocs = 0;
alloc->page->refs.counter = ALLOC_BIAS;
}

/*
 * Allocate a fragment of at least @size bytes from the context's current
 * fragment page.
 *
 * Fragments are carved sequentially out of the page's data area in 8-byte
 * granules. When the current page cannot hold the request, the allocator
 * drops its bias on the page: if no fragment of that page is still
 * referenced the same page is reused, otherwise a fresh page is taken from
 * the msg_bufs pool.
 *
 * @ctx:  ring context owning the fragment allocator and the buffer pool
 * @size: requested size in bytes; a zero-size request still consumes one
 *        8-byte granule
 *
 * Returns a pointer into the page's data area, or NULL when @size exceeds
 * the data area of a page or no page is available.
 */
void *io_alloc_fragment(struct io_ring_ctx *ctx, size_t size)
{
	struct user_page_fragment_allocator *alloc = &ctx->fragment_allocator;
	void *ret;

	/*
	 * Reject oversized requests before rounding: roundup() and the
	 * addition with alloc->offset below would otherwise wrap around for
	 * huge sizes and let an out-of-bounds fragment through.
	 * (ARRAY_SIZE only inspects the type, alloc->page may be NULL here.)
	 */
	if (size > ARRAY_SIZE(alloc->page->data)) {
		return NULL;
	}

	/*
	 * Make every allocation consume space, otherwise an unbounded number
	 * of zero-size fragments could push num_allocs past ALLOC_BIAS and
	 * break the page reference accounting.
	 */
	if (size == 0) {
		size = 1;
	}
	size = roundup(size, 8);

	if (alloc->page == NULL) {
		alloc->page = io_alloc_msg_buf(ctx);
		if (alloc->page == NULL) {
			return NULL;
		}
		io_new_fragment_page(alloc);
	}

	if (size + alloc->offset > ARRAY_SIZE(alloc->page->data)) {
		/* Allocate new page if page still in use by user space, otherwise
		 * reuse the same page.
		 */
		if (!atomic_sub_and_test(ALLOC_BIAS - alloc->num_allocs,
					 &alloc->page->refs)) {
			alloc->page = io_alloc_msg_buf(ctx);
			if (alloc->page == NULL) {
				return NULL;
			}
		}
		io_new_fragment_page(alloc);
	}

	/* batched accounting: the page counter is only touched on page retire */
	++alloc->num_allocs;
	ret = &alloc->page->data[alloc->offset];
	alloc->offset += size;
	return ret;
}

/*
 * Put the fragment allocator into its initial "no page" state.
 *
 * Only the page pointer is reset here; offset and num_allocs are set by
 * io_new_fragment_page() once a page has been attached.
 */
static void io_alloc_fragment_init(struct user_page_fragment_allocator *alloc)
{
	struct user_page_fragment *const no_page = NULL;

	alloc->page = no_page;
}

/*
 * Detach the current fragment page from the allocator.
 *
 * The allocator's remaining bias (ALLOC_BIAS minus the fragments handed out)
 * is subtracted from the page counter. The page is returned to the caller
 * only if that subtraction brings the counter to zero; otherwise NULL is
 * returned. In both cases the allocator no longer references the page
 * afterwards.
 *
 * Returns the detached page, or NULL if there was no page or the counter did
 * not reach zero.
 */
static void *io_alloc_fragment_clear(struct user_page_fragment_allocator *alloc)
{
	struct user_page_fragment *cur = alloc->page;
	int remaining_bias;

	if (cur == NULL)
		return NULL;

	alloc->page = NULL;
	remaining_bias = ALLOC_BIAS - alloc->num_allocs;

	return atomic_sub_and_test(remaining_bias, &cur->refs) ? cur : NULL;
}

static void io_ring_ctx_ref_free(struct percpu_ref *ref)
{
struct io_ring_ctx *ctx = container_of(ref, struct io_ring_ctx, refs);
Expand Down Expand Up @@ -288,9 +363,19 @@ static int io_ring_ctx_init(struct io_ring_ctx *ctx, struct io_uring_params *par
}
ctx->nr_user_files = IORING_MAX_FIXED_FILES;

/* Pool of message buffers handed over by user space, used as a stack. */
ctx->nr_msg_bufs = 0;
ctx->msg_bufs = kcalloc(PXD_IO_MAX_MSG_BUFS, sizeof(void *), GFP_KERNEL);
if (ctx->msg_bufs == NULL) {
	vfree(ctx->queue);
	kfree(ctx->user_files);
	/*
	 * Bail out here: falling through would continue initialization
	 * with a NULL msg_bufs and free queue/user_files a second time
	 * on the next error path.
	 */
	return -ENOMEM;
}

io_alloc_fragment_init(&ctx->fragment_allocator);

if (percpu_ref_init(&ctx->refs, io_ring_ctx_ref_free, 0, GFP_KERNEL)) {
vfree(ctx->queue);
kfree(ctx->user_files);
kfree(ctx->msg_bufs);
return -ENOMEM;
}

Expand Down Expand Up @@ -2586,6 +2671,79 @@ static int io_sqe_register_region(struct io_ring_ctx *ctx, void __user *uarg)
return ret;
}

/*
 * PXD_IOC_GIVE_BUFFERS handler: append a user supplied list of buffer
 * pointers to the context's msg_bufs pool.
 *
 * @ctx:  ring context owning the pool
 * @uarg: user pointer to a struct pxd_ioc_give_buffers
 *
 * Returns 0 on success, -EFAULT if the argument or the buffer list cannot be
 * read, -EINVAL if the pool cannot take arg.count more entries.
 */
static int io_sqe_give_buffers(struct io_ring_ctx *ctx, void __user *uarg)
{
	struct pxd_ioc_give_buffers arg;

	if (copy_from_user(&arg, uarg, sizeof(arg))) {
		return -EFAULT;
	}

	/*
	 * Compare against the remaining capacity instead of adding to the
	 * current fill level: arg.count is user controlled and the sum (as
	 * well as arg.count * sizeof(void *) below) could wrap around and
	 * slip past the limit.
	 */
	if (ctx->nr_msg_bufs > PXD_IO_MAX_MSG_BUFS ||
	    arg.count > PXD_IO_MAX_MSG_BUFS - ctx->nr_msg_bufs) {
		return -EINVAL;
	}

	if (copy_from_user(&ctx->msg_bufs[ctx->nr_msg_bufs], arg.buffers,
			   arg.count * sizeof(void *))) {
		return -EFAULT;
	}

	ctx->nr_msg_bufs += arg.count;

	/* %zu / %u match size_t / unsigned; newline terminates the log record */
	pr_info("%s: count %zu nr_bufs %u\n", __func__, arg.count, ctx->nr_msg_bufs);

	return 0;
}

/*
 * PXD_IOC_FREE_BUFFERS handler: hand unused buffers back to user space.
 *
 * The current fragment page is returned first if no fragment of it is still
 * referenced, followed by buffers popped from the msg_bufs pool, up to
 * arg.count entries in total.
 *
 * @ctx:  ring context owning the pool and the fragment allocator
 * @uarg: user pointer to a struct pxd_ioc_free_buffers
 *
 * Returns the number of buffer pointers written to arg.buffers (possibly
 * fewer than requested, including 0), -EFAULT if user memory cannot be
 * accessed, -EINVAL for a negative count, -ENOMEM if the temporary list
 * cannot be allocated. On -EFAULT from the final copy the buffers are taken
 * back by the kernel.
 */
static int io_sqe_free_buffers(struct io_ring_ctx *ctx, struct pxd_ioc_free_buffers __user *uarg)
{
	struct user_page_fragment_allocator *alloc = &ctx->fragment_allocator;
	struct pxd_ioc_free_buffers arg;
	void **bufs;
	size_t limit;
	size_t nr_frag = 0;
	size_t i;

	if (copy_from_user(&arg, uarg, sizeof(arg))) {
		return -EFAULT;
	}

	/*
	 * count is unsigned, so a negative value passed by user space shows
	 * up as a huge one; a plain "< 0" comparison can never be true.
	 */
	if ((long)arg.count < 0) {
		return -EINVAL;
	}

	if (arg.count == 0) {
		return 0;
	}

	/*
	 * No more than the pooled buffers plus the fragment page can ever be
	 * returned; clamp so the temporary allocation is not sized by an
	 * arbitrary user supplied value.
	 */
	limit = (size_t)ctx->nr_msg_bufs + 1;
	if (arg.count > limit) {
		arg.count = limit;
	}

	bufs = kcalloc(arg.count, sizeof(void *), GFP_KERNEL);
	if (bufs == NULL) {
		return -ENOMEM;
	}

	/* try freeing fragment page */
	bufs[0] = io_alloc_fragment_clear(alloc);
	if (bufs[0] != NULL) {
		nr_frag = 1;
	}

	for (i = nr_frag; i < arg.count; ++i) {
		bufs[i] = io_alloc_msg_buf(ctx);
		if (bufs[i] == NULL) {
			break;
		}
	}

	if (copy_to_user(arg.buffers, bufs, i * sizeof(void *))) {
		/*
		 * Failed to copy, take back the buffers. The fragment page
		 * did not come from the pool, so pushing it there could write
		 * one entry past a pool that was full; re-attach it to the
		 * allocator as a fresh page instead.
		 */
		if (nr_frag) {
			alloc->page = bufs[0];
			io_new_fragment_page(alloc);
		}
		memcpy(&ctx->msg_bufs[ctx->nr_msg_bufs], &bufs[nr_frag],
		       (i - nr_frag) * sizeof(void *));
		ctx->nr_msg_bufs += i - nr_frag;
		kfree(bufs);
		return -EFAULT;
	}

	kfree(bufs);
	return (int)i;
}

static int io_sq_offload_start(struct io_ring_ctx *ctx, struct io_uring_params *p)
{
int ret;
Expand Down Expand Up @@ -2944,6 +3102,10 @@ static long io_uring_ioctl(struct file *filp, unsigned int cmd, unsigned long ar
return io_sqe_buffer_unregister(ctx);
case PXD_IOC_REGISTER_REGION:
return io_sqe_register_region(ctx, (void *) arg);
case PXD_IOC_GIVE_BUFFERS:
return io_sqe_give_buffers(ctx, (void *) arg);
case PXD_IOC_FREE_BUFFERS:
return io_sqe_free_buffers(ctx, (void *) arg);
default:
return -ENOTTY;
}
Expand Down
18 changes: 18 additions & 0 deletions io.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,18 @@ struct io_mapped_ubuf {
unsigned int nr_bvecs;
};

/*
 * One message buffer used as a fragment page by io_alloc_fragment().
 * A small header is followed by the storage fragments are carved from.
 * NOTE(review): header plus data presumably add up to 4096 bytes (one page)
 * -- confirm against the user space side.
 */
struct user_page_fragment {
/* reference counter; set to ALLOC_BIAS while the allocator owns the page */
atomic_t refs;
/* not referenced by the allocator code; presumably keeps data 8-byte aligned -- confirm */
int pad;
/* fragment storage, handed out sequentially in multiples of 8 bytes */
char data[4088];
};

/*
 * Per-context state of the fragment allocator: the page currently being
 * carved up and the bookkeeping for it.
 */
struct user_page_fragment_allocator {
/* current fragment page, NULL when no page is attached */
struct user_page_fragment *page;
int offset; /* offset to the next allocation */
int num_allocs; /* count the allocations to batch update atomic page counter */
};

struct io_ring_ctx {
struct {
struct percpu_ref refs;
Expand Down Expand Up @@ -111,6 +123,12 @@ struct io_ring_ctx {
#define PXD_IO_MAX_USER_BUFS 16
struct io_mapped_ubuf user_bufs[PXD_IO_MAX_USER_BUFS];

#define PXD_IO_MAX_MSG_BUFS 4096
unsigned nr_msg_bufs;
void **msg_bufs;

struct user_page_fragment_allocator fragment_allocator;

struct completion ctx_done;

struct {
Expand Down
13 changes: 13 additions & 0 deletions pxd.h
Original file line number Diff line number Diff line change
Expand Up @@ -47,6 +47,8 @@
#define PXD_IOC_REGISTER_BUFFERS _IO(PXD_IOCTL_MAGIC, 13) /* 0x50580d */
#define PXD_IOC_UNREGISTER_BUFFERS _IO(PXD_IOCTL_MAGIC, 14) /* 0x50580e */
#define PXD_IOC_REGISTER_REGION _IO(PXD_IOCTL_MAGIC, 15)
/* transfer a list of buffers to the kernel, argument: struct pxd_ioc_give_buffers */
#define PXD_IOC_GIVE_BUFFERS _IO(PXD_IOCTL_MAGIC, 16)
/* get unused buffers back from the kernel, argument: struct pxd_ioc_free_buffers */
#define PXD_IOC_FREE_BUFFERS _IO(PXD_IOCTL_MAGIC, 17)

struct pxd_ioc_register_buffers {
void *base;
Expand All @@ -59,6 +61,17 @@ struct pxd_ioc_register_region {
size_t len;
};

/*
 * Argument of PXD_IOC_GIVE_BUFFERS.
 * The ioctl returns 0 on success and fails with -EINVAL when the kernel's
 * buffer pool cannot hold count additional entries.
 */
struct pxd_ioc_give_buffers {
size_t count; /* number of entries in buffers */
void *const *buffers; /* list of buffers to transfer to the kernel */
};

/*
 * Argument of PXD_IOC_FREE_BUFFERS.
 * On success the ioctl's return value is the number of buffer pointers the
 * kernel wrote to buffers; this can be fewer than count (including 0) when
 * the kernel has fewer unused buffers available.
 */
struct pxd_ioc_free_buffers {
size_t count; /* number of entries in buffers */
void **buffers; /* list of buffers returned to user space filled by ioctl */
};

#define PXD_MAX_DEVICES 512 /**< maximum number of devices supported */
#define PXD_MAX_IO (1024*1024) /**< maximum io size in bytes */
#define PXD_MAX_QDEPTH 256 /**< maximum device queue depth */
Expand Down