aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--ChangeLog4
-rw-r--r--Makefile.am3
-rw-r--r--doc/kernel.txt130
-rw-r--r--kernel/dev.c144
4 files changed, 245 insertions, 36 deletions
diff --git a/ChangeLog b/ChangeLog
index 9ffdbda..ba85a21 100644
--- a/ChangeLog
+++ b/ChangeLog
@@ -1,3 +1,7 @@
+2005-03-24 Miklos Szeredi <miklos@szeredi.hu>
+
+ * kernel: trivial cleanups
+
2005-03-21 Miklos Szeredi <miklos@szeredi.hu>
* Add fsyncdir() operation
diff --git a/Makefile.am b/Makefile.am
index 28f5b32..caeaed7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -7,7 +7,8 @@ EXTRA_DIST = \
README* \
Filesystems \
FAQ \
- doc/how-fuse-works
+ doc/how-fuse-works \
+ doc/kernel.txt
pkgconfigdir = $(libdir)/pkgconfig
pkgconfig_DATA = fuse.pc
diff --git a/doc/kernel.txt b/doc/kernel.txt
new file mode 100644
index 0000000..6fcfada
--- /dev/null
+++ b/doc/kernel.txt
@@ -0,0 +1,130 @@
+The following diagram shows how a filesystem operation (in this
+example unlink) is performed in FUSE.
+
+NOTE: everything in this description is greatly simplified
+
+ | "rm /mnt/fuse/file" | FUSE filesystem daemon
+ | |
+ | | >sys_read()
+ | | >fuse_dev_read()
+ | | >request_wait()
+ | | [sleep on fc->waitq]
+ | |
+ | >sys_unlink() |
+ | >fuse_unlink() |
+ | [get request from |
+ | fc->unused_list] |
+ | >request_send() |
+ | [queue req on fc->pending] |
+ | [wake up fc->waitq] | [woken up]
+ | >request_wait_answer() |
+ | [sleep on req->waitq] |
+ | | <request_wait()
+ | | [remove req from fc->pending]
+ | | [copy req to read buffer]
+ | | [add req to fc->processing]
+ | | <fuse_dev_read()
+ | | <sys_read()
+ | |
+ | | [perform unlink]
+ | |
+ | | >sys_write()
+ | | >fuse_dev_write()
+ | | [look up req in fc->processing]
+ | | [remove from fc->processing]
+ | | [copy write buffer to req]
+ | [woken up] | [wake up req->waitq]
+ | | <fuse_dev_write()
+ | | <sys_write()
+ | <request_wait_answer() |
+ | <request_send() |
+ | [add request to |
+ | fc->unused_list] |
+ | <fuse_unlink() |
+ | <sys_unlink() |
+
+There are a couple of ways in which to deadlock a FUSE filesystem.
+Since we are talking about unprivileged userspace programs,
+something must be done about these.
+
+Scenario 1 - Simple deadlock
+-----------------------------
+
+ | "rm /mnt/fuse/file" | FUSE filesystem daemon
+ | |
+ | >sys_unlink("/mnt/fuse/file") |
+ | [acquire inode semaphore |
+ | for "file"] |
+ | >fuse_unlink() |
+ | [sleep on req->waitq] |
+ | | <sys_read()
+ | | >sys_unlink("/mnt/fuse/file")
+ | | [acquire inode semaphore
+ | | for "file"]
+ | | *DEADLOCK*
+
+The solution for this is to allow requests to be interrupted while
+they are in userspace:
+
+ | [interrupted by signal] |
+ | <fuse_unlink() |
+ | [release semaphore] | [semaphore acquired]
+ | <sys_unlink() |
+ | | >fuse_unlink()
+ | | [queue req on fc->pending]
+ | | [wake up fc->waitq]
+ | | [sleep on req->waitq]
+
+If the filesystem daemon was single threaded, this will stop here,
+since there's no other thread to dequeue and execute the request.
+In this case the solution is to kill the FUSE daemon as well. If
+there are multiple serving threads, you just have to kill them as
+long as any remain.
+
+Moral: a filesystem which deadlocks, can soon find itself dead.
+
+Scenario 2 - Tricky deadlock
+----------------------------
+
+This one needs a carefully crafted filesystem. It's a variation on
+the above, only the call back to the filesystem is not explicit,
+but is caused by a pagefault.
+
+ | Kamikaze filesystem thread 1 | Kamikaze filesystem thread 2
+ | |
+ | [fd = open("/mnt/fuse/file")] | [request served normally]
+ | [mmap fd to 'addr'] |
+ | [close fd] | [FLUSH triggers 'magic' flag]
+ | [read a byte from addr] |
+ | >do_page_fault() |
+ | [find or create page] |
+ | [lock page] |
+ | >fuse_readpage() |
+ | [queue READ request] |
+ | [sleep on req->waitq] |
+ | | [read request to buffer]
+ | | [create reply header before addr]
+ | | >sys_write(addr - headerlength)
+ | | >fuse_dev_write()
+ | | [look up req in fc->processing]
+ | | [remove from fc->processing]
+ | | [copy write buffer to req]
+ | | >do_page_fault()
+ | | [find or create page]
+ | | [lock page]
+ | | * DEADLOCK *
+
+Solution is again to let the request be interrupted (not
+elaborated further).
+
+An additional problem is that while the write buffer is being
+copied to the request, the request must not be interrupted. This
+is because the destination address of the copy may not be valid
+after the request is interrupted.
+
+This is solved with doing the copy atomically, and allowing
+interruption while the page(s) belonging to the write buffer are
+faulted with get_user_pages(). The 'req->locked' flag indicates
+when the copy is taking place, and interruption is delayed until
+this flag is unset.
+
diff --git a/kernel/dev.c b/kernel/dev.c
index 5941035..b17296a 100644
--- a/kernel/dev.c
+++ b/kernel/dev.c
@@ -146,6 +146,11 @@ struct fuse_req *fuse_get_request(struct fuse_conn *fc)
return do_get_request(fc);
}
+/*
+ * Non-interruptible version of the above function is for operations
+ * which can't legally return -ERESTART{SYS,NOINTR}. This can still
+ * return NULL, but only in case the signal is SIGKILL.
+ */
struct fuse_req *fuse_get_request_nonint(struct fuse_conn *fc)
{
int intr;
@@ -165,6 +170,7 @@ static void fuse_putback_request(struct fuse_conn *fc, struct fuse_req *req)
else
fuse_request_free(req);
+ /* If we are in debt decrease that first */
if (fc->outstanding_debt)
fc->outstanding_debt--;
else
@@ -180,10 +186,8 @@ void fuse_put_request(struct fuse_conn *fc, struct fuse_req *req)
void fuse_release_background(struct fuse_req *req)
{
- if (req->inode)
- iput(req->inode);
- if (req->inode2)
- iput(req->inode2);
+ iput(req->inode);
+ iput(req->inode2);
if (req->file)
fput(req->file);
spin_lock(&fuse_lock);
@@ -191,7 +195,17 @@ void fuse_release_background(struct fuse_req *req)
spin_unlock(&fuse_lock);
}
-/* Called with fuse_lock, unlocks it */
+/*
+ * This function is called when a request is finished. Either a reply
+ * has arrived or it was interrupted (and not yet sent) or some error
+ * occurred during communication with userspace, or the device file was
+ * closed. It decreases the reference count for the request. In case
+ * of a background request the references to the stored objects are
+ * released. The requester thread is woken up (if still waiting), and
+ * finally the request is either freed or put on the unused_list
+ *
+ * Called with fuse_lock, unlocks it
+ */
static void request_end(struct fuse_conn *fc, struct fuse_req *req)
{
int putback;
@@ -218,10 +232,37 @@ static void request_end(struct fuse_conn *fc, struct fuse_req *req)
fuse_putback_request(fc, req);
}
+/*
+ * Unfortunately request interruption not just solves the deadlock
+ * problem, it causes problems too. These stem from the fact that an
+ * interrupted request is continued to be processed in userspace,
+ * while all the locks and object references (inode and file) held
+ * during the operation are released.
+ *
+ * To release the locks is exactly why there's a need to interrupt the
+ * request, so there's not a lot that can be done about this, except
+ * introduce additional locking in userspace.
+ *
+ * More important is to keep inode and file references until userspace
+ * has replied, otherwise FORGET and RELEASE could be sent while the
+ * inode/file is still used by the filesystem.
+ *
+ * For this reason the concept of "background" request is introduced.
+ * An interrupted request is backgrounded if it has been already sent
+ * to userspace. Backgrounding involves getting an extra reference to
+ * inode(s) or file used in the request, and adding the request to
+ * fc->background list. When a reply is received for a background
+ * request, the object references are released, and the request is
+ * removed from the list. If the filesystem is unmounted while there
+ * are still background requests, the list is walked and references
+ * are released as if a reply was received.
+ *
+ * There's one more use for a background request. The RELEASE message is
+ * always sent as background, since it doesn't return an error or
+ * data.
+ */
static void background_request(struct fuse_conn *fc, struct fuse_req *req)
{
- /* Need to get hold of the inode(s) and/or file used in the
- request, so FORGET and RELEASE are not sent too early */
req->background = 1;
list_add(&req->bg_entry, &fc->background);
if (req->inode)
@@ -273,7 +314,7 @@ static void request_wait_answer(struct fuse_conn *fc, struct fuse_req *req,
if (req->locked) {
/* This is uninterruptible sleep, because data is
being copied to/from the buffers of req. During
- locked state, there musn't be any filesystem
+ locked state, there mustn't be any filesystem
operation (e.g. page fault), since that could lead
to deadlock */
spin_unlock(&fuse_lock);
@@ -308,7 +349,12 @@ static void queue_request(struct fuse_conn *fc, struct fuse_req *req)
req->in.h.len = sizeof(struct fuse_in_header) +
len_args(req->in.numargs, (struct fuse_arg *) req->in.args);
if (!req->preallocated) {
- /* decrease outstanding_sem, but without blocking... */
+ /* If request is not preallocated (either FORGET or
+ RELEASE), then still decrease outstanding_sem, so
+ user can't open infinite number of files while not
+ processing the RELEASE requests. However for
+ efficiency do it without blocking, so if down()
+ would block, just increase the debt instead */
if (down_trylock(&fc->outstanding_sem))
fc->outstanding_debt++;
}
@@ -338,6 +384,11 @@ void request_send(struct fuse_conn *fc, struct fuse_req *req)
request_send_wait(fc, req, 1);
}
+/*
+ * Non-interruptible version of the above function is for operations
+ * which can't legally return -ERESTART{SYS,NOINTR}. This can still
+ * be interrupted but only with SIGKILL.
+ */
void request_send_nonint(struct fuse_conn *fc, struct fuse_req *req)
{
request_send_wait(fc, req, 0);
@@ -388,6 +439,11 @@ void fuse_send_init(struct fuse_conn *fc)
request_send_background(fc, req);
}
+/*
+ * Lock the request. Up to the next unlock_request() there mustn't be
+ * anything that could cause a page-fault. If the request was already
+ * interrupted bail out.
+ */
static inline int lock_request(struct fuse_req *req)
{
int err = 0;
@@ -402,6 +458,11 @@ static inline int lock_request(struct fuse_req *req)
return err;
}
+/*
+ * Unlock request. If it was interrupted during being locked, the
+ * requester thread is currently waiting for it to be unlocked, so
+ * wake it up.
+ */
static inline void unlock_request(struct fuse_req *req)
{
if (req) {
@@ -413,15 +474,6 @@ static inline void unlock_request(struct fuse_req *req)
}
}
-/* Why all this complex one-page-at-a-time copying needed instead of
- just copy_to/from_user()? The reason is that blocking on a page
- fault must be avoided while the request is locked. This is because
- if servicing that pagefault happens to be done by this filesystem,
- an unbreakable deadlock can occur. So the code is careful to allow
- request interruption during get_user_pages(), and only lock the
- request while doing kmapped copying, which cannot block.
- */
-
struct fuse_copy_state {
int write;
struct fuse_req *req;
@@ -435,26 +487,18 @@ struct fuse_copy_state {
unsigned len;
};
-static unsigned fuse_copy_init(struct fuse_copy_state *cs, int write,
- struct fuse_req *req, const struct iovec *iov,
- unsigned long nr_segs)
+static void fuse_copy_init(struct fuse_copy_state *cs, int write,
+ struct fuse_req *req, const struct iovec *iov,
+ unsigned long nr_segs)
{
- unsigned i;
- unsigned nbytes;
-
memset(cs, 0, sizeof(*cs));
cs->write = write;
cs->req = req;
cs->iov = iov;
cs->nr_segs = nr_segs;
-
- nbytes = 0;
- for (i = 0; i < nr_segs; i++)
- nbytes += iov[i].iov_len;
-
- return nbytes;
}
+/* Unmap and put previous page of userspace buffer */
static inline void fuse_copy_finish(struct fuse_copy_state *cs)
{
if (cs->mapaddr) {
@@ -468,6 +512,10 @@ static inline void fuse_copy_finish(struct fuse_copy_state *cs)
}
}
+/*
+ * Get another pagefull of userspace buffer, and map it to kernel
+ * address space, and lock request
+ */
static int fuse_copy_fill(struct fuse_copy_state *cs)
{
unsigned long offset;
@@ -499,6 +547,7 @@ static int fuse_copy_fill(struct fuse_copy_state *cs)
return lock_request(cs->req);
}
+/* Do as much copy to/from userspace buffer as we can */
static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val,
unsigned *size)
{
@@ -516,6 +565,10 @@ static inline int fuse_copy_do(struct fuse_copy_state *cs, void **val,
return ncpy;
}
+/*
+ * Copy a page in the request to/from the userspace buffer. Must be
+ * done atomically
+ */
static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
unsigned offset, unsigned count, int zeroing)
{
@@ -541,6 +594,7 @@ static inline int fuse_copy_page(struct fuse_copy_state *cs, struct page *page,
return 0;
}
+/* Copy pages in the request to/from userspace buffer */
static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
int zeroing)
{
@@ -562,6 +616,7 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes,
return 0;
}
+/* Copy a single argument in the request to/from userspace buffer */
static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
{
while (size) {
@@ -573,6 +628,7 @@ static int fuse_copy_one(struct fuse_copy_state *cs, void *val, unsigned size)
return 0;
}
+/* Copy request arguments to/from userspace buffer */
static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
unsigned argpages, struct fuse_arg *args,
int zeroing)
@@ -590,6 +646,7 @@ static int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs,
return err;
}
+/* Wait until a request is available on the pending list */
static void request_wait(struct fuse_conn *fc)
{
DECLARE_WAITQUEUE(wait, current);
@@ -608,6 +665,15 @@ static void request_wait(struct fuse_conn *fc)
remove_wait_queue(&fc->waitq, &wait);
}
+/*
+ * Read a single request into the userspace filesystem's buffer. This
+ * function waits until a request is available, then removes it from
+ * the pending list and copies request data to userspace buffer. If
+ * no reply is needed (FORGET) or request has been interrupted or
+ * there was an error during the copying then it's finished by calling
+ * request_end(). Otherwise add it to the processing list, and set
+ * the 'sent' flag.
+ */
static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
unsigned long nr_segs, loff_t *off)
{
@@ -616,7 +682,6 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
struct fuse_req *req;
struct fuse_in *in;
struct fuse_copy_state cs;
- unsigned nbytes;
unsigned reqsize;
spin_lock(&fuse_lock);
@@ -638,9 +703,9 @@ static ssize_t fuse_dev_readv(struct file *file, const struct iovec *iov,
in = &req->in;
reqsize = req->in.h.len;
- nbytes = fuse_copy_init(&cs, 1, req, iov, nr_segs);
+ fuse_copy_init(&cs, 1, req, iov, nr_segs);
err = -EINVAL;
- if (nbytes >= reqsize) {
+ if (iov_length(iov, nr_segs) >= reqsize) {
err = fuse_copy_one(&cs, &in->h, sizeof(in->h));
if (!err)
err = fuse_copy_args(&cs, in->numargs, in->argpages,
@@ -681,6 +746,7 @@ static ssize_t fuse_dev_read(struct file *file, char __user *buf,
return fuse_dev_readv(file, &iov, 1, off);
}
+/* Look up request on processing list by unique ID */
static struct fuse_req *request_find(struct fuse_conn *fc, unsigned unique)
{
struct list_head *entry;
@@ -717,11 +783,18 @@ static int copy_out_args(struct fuse_copy_state *cs, struct fuse_out *out,
out->page_zeroing);
}
+/*
+ * Write a single reply to a request. First the header is copied from
+ * the write buffer. The request is then searched on the processing
+ * list by the unique ID found in the header. If found, then remove
+ * it from the list and copy the rest of the buffer to the request.
+ * The request is finished by calling request_end()
+ */
static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
unsigned long nr_segs, loff_t *off)
{
int err;
- unsigned nbytes;
+ unsigned nbytes = iov_length(iov, nr_segs);
struct fuse_req *req;
struct fuse_out_header oh;
struct fuse_copy_state cs;
@@ -729,7 +802,7 @@ static ssize_t fuse_dev_writev(struct file *file, const struct iovec *iov,
if (!fc)
return -ENODEV;
- nbytes = fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
+ fuse_copy_init(&cs, 0, NULL, iov, nr_segs);
if (nbytes < sizeof(struct fuse_out_header))
return -EINVAL;
@@ -806,6 +879,7 @@ static unsigned fuse_dev_poll(struct file *file, poll_table *wait)
return mask;
}
+/* Abort all requests on the given list (pending or processing) */
static void end_requests(struct fuse_conn *fc, struct list_head *head)
{
while (!list_empty(head)) {