Discussion:
[PATCH] cowlinks v2
(too old to reply)
Jörn Engel
2004-03-20 08:34:11 UTC
Permalink
Hi!

Version 2 of my cowlink patch, tested and currently running on my
machine.

Al, I'd especially like your opinion on it. Would you accept
something like this?

Changes since v1:
o moved break_cow_link() check to get_write_access()
o added inode locking when changing flags
o switched to mark_inode_dirty_sync()

TODO:
o Disallow fcntl() for filesystems without support
o Proper support for ext[23]
o Switch to mark_inode_dirty() without sync?
o Library support for
o copyfile() (link and set cow-flag)
o cow_open() (break link if open() fails)

J=F6rn

--=20
Premature optimization is the root of all evil.
-- Donald Knuth


fs/ext2/inode.c | 3 +-
fs/ext3/inode.c | 4 ++
fs/fcntl.c | 21 +++++++++++++++
fs/namei.c | 70 +++++++++++++++++++++++++++++++++++++++++=
+++++++++
include/linux/fcntl.h | 3 ++
include/linux/fs.h | 3 ++
6 files changed, 102 insertions(+), 2 deletions(-)

--- linux-2.6.4/include/linux/fcntl.h~cowlink 2004-03-19 17:38:48.00000=
0000 +0100
+++ linux-2.6.4/include/linux/fcntl.h 2004-03-19 17:52:49.000000000 +01=
00
@@ -23,6 +23,9 @@
#define DN_ATTRIB 0x00000020 /* File changed attibutes */
#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */
=20
+#define F_SETCOW (F_LINUX_SPECIFIC_BASE+3)
+#define F_GETCOW (F_LINUX_SPECIFIC_BASE+4)
+
#ifdef __KERNEL__
=20
#if BITS_PER_LONG =3D=3D 32
--- linux-2.6.4/include/linux/fs.h~cowlink 2004-03-19 17:47:29.00000000=
0 +0100
+++ linux-2.6.4/include/linux/fs.h 2004-03-19 17:52:49.000000000 +0100
@@ -137,6 +137,9 @@
#define S_DEAD 32 /* removed, but still open directory */
#define S_NOQUOTA 64 /* Inode is not counted to quota */
#define S_DIRSYNC 128 /* Directory modifications are synchronous */
+#define S_COWLINK 256 /* Hard links have copy on write semantics.
+ * This flag has no meaning for directories,
+ * but is inherited to directory children */
=20
/*
* Note that nosuid etc flags are inode-specific: setting some file-sy=
stem
--- linux-2.6.4/fs/fcntl.c~cowlink 2004-03-19 17:47:15.000000000 +0100
+++ linux-2.6.4/fs/fcntl.c 2004-03-19 17:59:20.000000000 +0100
@@ -282,6 +282,20 @@
=20
EXPORT_SYMBOL(f_delown);
=20
+static long fcntl_setcow(struct file *filp, unsigned long arg)
+{
+ struct inode *inode =3D filp->f_dentry->d_inode;
+
+ spin_lock(&inode->i_lock);
+ if (arg)
+ inode->i_flags |=3D S_COWLINK;
+ else
+ inode->i_flags &=3D ~S_COWLINK;
+ mark_inode_dirty_sync(inode);
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
static long do_fcntl(unsigned int fd, unsigned int cmd,
unsigned long arg, struct file * filp)
{
@@ -346,6 +360,13 @@
case F_NOTIFY:
err =3D fcntl_dirnotify(fd, filp, arg);
break;
+ case F_SETCOW:
+ err =3D fcntl_setcow(filp, arg);
+ break;
+ case F_GETCOW:
+ err =3D (filp->f_dentry->d_inode->i_flags & S_COWLINK) /
+ S_COWLINK;
+ break;
default:
break;
}
--- linux-2.6.4/fs/namei.c~cowlink 2004-03-19 17:47:19.000000000 +0100
+++ linux-2.6.4/fs/namei.c 2004-03-19 18:10:00.000000000 +0100
@@ -224,6 +224,33 @@
}
=20
/*
+ * Files with the S_COWLINK flag set cannot be written to, if more
+ * than one hard link to them exists. Ultimately, this function
+ * should copy the inode, assign the copy to the dentry and lower use
+ * count of the old inode - one day.
+ * For now, it is sufficient to return an error and let userspace
+ * deal with the messy part. Not exactly the meaning of
+ * copy-on-write, but much better than writing to fifty files at once
+ * and noticing month later.
+ *
+ * Yes, this breaks the kernel interface and is simply wrong. This
+ * is intended behaviour, so Linus will not merge the code before
+ * it is complete. Or will he?
+ */
+static int break_cow_link(struct inode *inode)
+{
+ if (!(inode->i_flags & S_COWLINK))
+ return 0;
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (inode->i_nlink < 2)
+ return 0;
+ /* TODO: As soon as sendfile can do normal file copies, use that
+ * and always return 0 */
+ return -EMLINK;
+}
+
+/*
* get_write_access() gets write permission for a file.
* put_write_access() releases this write permission.
* This is used for regular files.
@@ -243,7 +270,14 @@
=20
int get_write_access(struct inode * inode)
{
+ int error;
+
spin_lock(&inode->i_lock);
+ error =3D break_cow_link(inode);
+ if (error) {
+ spin_unlock(&inode->i_lock);
+ return error;
+ }
if (atomic_read(&inode->i_writecount) < 0) {
spin_unlock(&inode->i_lock);
return -ETXTBSY;
@@ -1148,6 +1182,10 @@
if (!error) {
inode_dir_notify(dir, DN_CREATE);
security_inode_post_create(dir, dentry, mode);
+ spin_lock(&inode->i_lock);
+ dentry->d_inode->i_flags |=3D dir->i_flags & S_COWLINK;
+ mark_inode_dirty_sync(inode);
+ spin_unlock(&inode->i_lock);
}
return error;
}
@@ -1522,6 +1560,9 @@
if (!error) {
inode_dir_notify(dir, DN_CREATE);
security_inode_post_mkdir(dir,dentry, mode);
+ spin_lock(&inode->i_lock);
+ dentry->d_inode->i_flags |=3D dir->i_flags & S_COWLINK;
+ spin_unlock(&inode->i_lock);
}
return error;
}
@@ -1820,6 +1861,13 @@
return -EXDEV;
=20
/*
+ * Cowlink attribute is inherited from directory, but here,
+ * the inode already has one. If they don't match, bail out.
+ */
+ if ((dir->i_flags ^ old_dentry->d_inode->i_flags) & S_COWLINK)
+ return -EMLINK;
+
+ /*
* A link to an append-only or immutable file cannot be created.
*/
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
@@ -1997,6 +2045,24 @@
return error;
}
=20
+static int cow_allow_rename(struct inode *old_dir, struct dentry *old_=
dentry,
+ struct inode *new_dir)
+{
+ /* source and target share directory: allow */
+ if (old_dir =3D=3D new_dir)
+ return 0;
+ /* source and target directory have identical cowlink flag: allow */
+ if (! ((old_dentry->d_inode->i_flags ^ new_dir->i_flags) & S_COWLINK)=
)
+ return 0;
+ /* We could always fail here, but cowlink flag is only defined for
+ * files and directories, so let's allow special files */
+ if (!S_ISREG(old_dentry->d_inode->i_mode))
+ return -EMLINK;
+ if (!S_ISDIR(old_dentry->d_inode->i_mode))
+ return -EMLINK;
+ return 0;
+}
+
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
@@ -2020,6 +2086,10 @@
if (!old_dir->i_op || !old_dir->i_op->rename)
return -EPERM;
=20
+ error =3D cow_allow_rename(old_dir, old_dentry, new_dir);
+ if (error)
+ return error;
+
DQUOT_INIT(old_dir);
DQUOT_INIT(new_dir);
=20
--- linux-2.6.4/fs/ext2/inode.c~cowlink 2004-03-19 17:44:02.000000000 +=
0100
+++ linux-2.6.4/fs/ext2/inode.c 2004-03-19 17:52:49.000000000 +0100
@@ -1020,6 +1020,7 @@
{
unsigned int flags =3D EXT2_I(inode)->i_flags;
=20
+ inode->i_flags =3D flags;
inode->i_flags &=3D ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC=
);
if (flags & EXT2_SYNC_FL)
inode->i_flags |=3D S_SYNC;
@@ -1191,7 +1192,7 @@
=20
raw_inode->i_blocks =3D cpu_to_le32(inode->i_blocks);
raw_inode->i_dtime =3D cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags =3D cpu_to_le32(ei->i_flags);
+ raw_inode->i_flags =3D cpu_to_le32(inode->i_flags);
raw_inode->i_faddr =3D cpu_to_le32(ei->i_faddr);
raw_inode->i_frag =3D ei->i_frag_no;
raw_inode->i_fsize =3D ei->i_frag_size;
--- linux-2.6.4/fs/ext3/inode.c~cowlink 2004-03-19 17:44:02.000000000 +=
0100
+++ linux-2.6.4/fs/ext3/inode.c 2004-03-19 17:52:49.000000000 +0100
@@ -2447,6 +2447,7 @@
{
unsigned int flags =3D EXT3_I(inode)->i_flags;
=20
+ inode->i_flags =3D flags;
inode->i_flags &=3D ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC=
);
if (flags & EXT3_SYNC_FL)
inode->i_flags |=3D S_SYNC;
@@ -2629,7 +2630,8 @@
raw_inode->i_mtime =3D cpu_to_le32(inode->i_mtime.tv_sec);
raw_inode->i_blocks =3D cpu_to_le32(inode->i_blocks);
raw_inode->i_dtime =3D cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags =3D cpu_to_le32(ei->i_flags);
+ raw_inode->i_flags =3D cpu_to_le32((ei->i_flags & ~S_COWLINK) |
+ (inode->i_flags & S_COWLINK));
#ifdef EXT3_FRAGMENTS
raw_inode->i_faddr =3D cpu_to_le32(ei->i_faddr);
raw_inode->i_frag =3D ei->i_frag_no;
Andrew Morton
2004-03-20 08:49:01 UTC
Permalink
Post by Jörn Engel
+static long fcntl_setcow(struct file *filp, unsigned long arg)
+{
+ struct inode *inode =3D filp->f_dentry->d_inode;
+
+ spin_lock(&inode->i_lock);
+ if (arg)
+ inode->i_flags |=3D S_COWLINK;
+ else
+ inode->i_flags &=3D ~S_COWLINK;
+ mark_inode_dirty_sync(inode);
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
i_lock is an innermost lock. No locks should be taken inside i_lock.

Here, not only is inode_lock being taken inside i_lock but ->dirty_inod=
e
may be called as well, and dirty_inode() may not be called under any
spinlock.
Jörn Engel
2004-03-20 11:27:18 UTC
Permalink
Post by Andrew Morton
Post by Jörn Engel
+static long fcntl_setcow(struct file *filp, unsigned long arg)
+{
+ struct inode *inode =3D filp->f_dentry->d_inode;
+
+ spin_lock(&inode->i_lock);
+ if (arg)
+ inode->i_flags |=3D S_COWLINK;
+ else
+ inode->i_flags &=3D ~S_COWLINK;
+ mark_inode_dirty_sync(inode);
+ spin_unlock(&inode->i_lock);
+ return 0;
+}
+
=20
i_lock is an innermost lock. No locks should be taken inside i_lock.
=20
Here, not only is inode_lock being taken inside i_lock but ->dirty_in=
ode
Post by Andrew Morton
may be called as well, and dirty_inode() may not be called under any
spinlock.
Is it enough to move the mark_inode_dirty outside of i_lock?

New patch below.

J=F6rn

--=20
The cheapest, fastest and most reliable components of a computer
system are those that aren't there.
-- Gordon Bell, DEC labratories

--- linux-2.6.4/include/linux/fcntl.h~cowlink 2004-03-19 17:38:48.00000=
0000 +0100
+++ linux-2.6.4/include/linux/fcntl.h 2004-03-19 17:52:49.000000000 +01=
00
@@ -23,6 +23,9 @@
#define DN_ATTRIB 0x00000020 /* File changed attibutes */
#define DN_MULTISHOT 0x80000000 /* Don't remove notifier */
=20
+#define F_SETCOW (F_LINUX_SPECIFIC_BASE+3)
+#define F_GETCOW (F_LINUX_SPECIFIC_BASE+4)
+
#ifdef __KERNEL__
=20
#if BITS_PER_LONG =3D=3D 32
--- linux-2.6.4/include/linux/fs.h~cowlink 2004-03-19 17:47:29.00000000=
0 +0100
+++ linux-2.6.4/include/linux/fs.h 2004-03-19 17:52:49.000000000 +0100
@@ -137,6 +137,9 @@
#define S_DEAD 32 /* removed, but still open directory */
#define S_NOQUOTA 64 /* Inode is not counted to quota */
#define S_DIRSYNC 128 /* Directory modifications are synchronous */
+#define S_COWLINK 256 /* Hard links have copy on write semantics.
+ * This flag has no meaning for directories,
+ * but is inherited to directory children */
=20
/*
* Note that nosuid etc flags are inode-specific: setting some file-sy=
stem
--- linux-2.6.4/fs/fcntl.c~cowlink 2004-03-19 17:47:15.000000000 +0100
+++ linux-2.6.4/fs/fcntl.c 2004-03-20 13:29:29.000000000 +0100
@@ -282,6 +282,20 @@
=20
EXPORT_SYMBOL(f_delown);
=20
+static long fcntl_setcow(struct file *filp, unsigned long arg)
+{
+ struct inode *inode =3D filp->f_dentry->d_inode;
+
+ spin_lock(&inode->i_lock);
+ if (arg)
+ inode->i_flags |=3D S_COWLINK;
+ else
+ inode->i_flags &=3D ~S_COWLINK;
+ spin_unlock(&inode->i_lock);
+ mark_inode_dirty_sync(inode);
+ return 0;
+}
+
static long do_fcntl(unsigned int fd, unsigned int cmd,
unsigned long arg, struct file * filp)
{
@@ -346,6 +360,13 @@
case F_NOTIFY:
err =3D fcntl_dirnotify(fd, filp, arg);
break;
+ case F_SETCOW:
+ err =3D fcntl_setcow(filp, arg);
+ break;
+ case F_GETCOW:
+ err =3D (filp->f_dentry->d_inode->i_flags & S_COWLINK) /
+ S_COWLINK;
+ break;
default:
break;
}
--- linux-2.6.4/fs/namei.c~cowlink 2004-03-19 17:47:19.000000000 +0100
+++ linux-2.6.4/fs/namei.c 2004-03-20 11:59:59.000000000 +0100
@@ -223,6 +223,40 @@
return security_inode_permission(inode, mask, nd);
}
=20
+static inline void set_cowflag(struct inode *inode)
+{
+ spin_lock(&inode->i_lock);
+ inode->i_flags |=3D S_COWLINK;
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Files with the S_COWLINK flag set cannot be written to, if more
+ * than one hard link to them exists. Ultimately, this function
+ * should copy the inode, assign the copy to the dentry and lower use
+ * count of the old inode - one day.
+ * For now, it is sufficient to return an error and let userspace
+ * deal with the messy part. Not exactly the meaning of
+ * copy-on-write, but much better than writing to fifty files at once
+ * and noticing month later.
+ *
+ * Yes, this breaks the kernel interface and is simply wrong. This
+ * is intended behaviour, so Linus will not merge the code before
+ * it is complete. Or will he?
+ */
+static int break_cow_link(struct inode *inode)
+{
+ if (!(inode->i_flags & S_COWLINK))
+ return 0;
+ if (!S_ISREG(inode->i_mode))
+ return 0;
+ if (inode->i_nlink < 2)
+ return 0;
+ /* TODO: As soon as sendfile can do normal file copies, use that
+ * and always return 0 */
+ return -EMLINK;
+}
+
/*
* get_write_access() gets write permission for a file.
* put_write_access() releases this write permission.
@@ -243,7 +277,14 @@
=20
int get_write_access(struct inode * inode)
{
+ int error;
+
spin_lock(&inode->i_lock);
+ error =3D break_cow_link(inode);
+ if (error) {
+ spin_unlock(&inode->i_lock);
+ return error;
+ }
if (atomic_read(&inode->i_writecount) < 0) {
spin_unlock(&inode->i_lock);
return -ETXTBSY;
@@ -1148,6 +1189,10 @@
if (!error) {
inode_dir_notify(dir, DN_CREATE);
security_inode_post_create(dir, dentry, mode);
+ if (dir->i_flags & S_COWLINK) {
+ set_cowflag(dentry->d_inode);
+ mark_inode_dirty_sync(dentry->d_inode);
+ }
}
return error;
}
@@ -1522,6 +1567,9 @@
if (!error) {
inode_dir_notify(dir, DN_CREATE);
security_inode_post_mkdir(dir,dentry, mode);
+ if (dir->i_flags & S_COWLINK) {
+ set_cowflag(dentry->d_inode);
+ }
}
return error;
}
@@ -1820,6 +1868,13 @@
return -EXDEV;
=20
/*
+ * Cowlink attribute is inherited from directory, but here,
+ * the inode already has one. If they don't match, bail out.
+ */
+ if ((dir->i_flags ^ old_dentry->d_inode->i_flags) & S_COWLINK)
+ return -EMLINK;
+
+ /*
* A link to an append-only or immutable file cannot be created.
*/
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
@@ -1997,6 +2052,24 @@
return error;
}
=20
+static int cow_allow_rename(struct inode *old_dir, struct dentry *old_=
dentry,
+ struct inode *new_dir)
+{
+ /* source and target share directory: allow */
+ if (old_dir =3D=3D new_dir)
+ return 0;
+ /* source and target directory have identical cowlink flag: allow */
+ if (! ((old_dentry->d_inode->i_flags ^ new_dir->i_flags) & S_COWLINK)=
)
+ return 0;
+ /* We could always fail here, but cowlink flag is only defined for
+ * files and directories, so let's allow special files */
+ if (!S_ISREG(old_dentry->d_inode->i_mode))
+ return -EMLINK;
+ if (!S_ISDIR(old_dentry->d_inode->i_mode))
+ return -EMLINK;
+ return 0;
+}
+
int vfs_rename(struct inode *old_dir, struct dentry *old_dentry,
struct inode *new_dir, struct dentry *new_dentry)
{
@@ -2020,6 +2093,10 @@
if (!old_dir->i_op || !old_dir->i_op->rename)
return -EPERM;
=20
+ error =3D cow_allow_rename(old_dir, old_dentry, new_dir);
+ if (error)
+ return error;
+
DQUOT_INIT(old_dir);
DQUOT_INIT(new_dir);
=20
--- linux-2.6.4/fs/ext2/inode.c~cowlink 2004-03-19 17:44:02.000000000 +=
0100
+++ linux-2.6.4/fs/ext2/inode.c 2004-03-19 17:52:49.000000000 +0100
@@ -1020,6 +1020,7 @@
{
unsigned int flags =3D EXT2_I(inode)->i_flags;
=20
+ inode->i_flags =3D flags;
inode->i_flags &=3D ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC=
);
if (flags & EXT2_SYNC_FL)
inode->i_flags |=3D S_SYNC;
@@ -1191,7 +1192,7 @@
=20
raw_inode->i_blocks =3D cpu_to_le32(inode->i_blocks);
raw_inode->i_dtime =3D cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags =3D cpu_to_le32(ei->i_flags);
+ raw_inode->i_flags =3D cpu_to_le32(inode->i_flags);
raw_inode->i_faddr =3D cpu_to_le32(ei->i_faddr);
raw_inode->i_frag =3D ei->i_frag_no;
raw_inode->i_fsize =3D ei->i_frag_size;
--- linux-2.6.4/fs/ext3/inode.c~cowlink 2004-03-19 17:44:02.000000000 +=
0100
+++ linux-2.6.4/fs/ext3/inode.c 2004-03-19 17:52:49.000000000 +0100
@@ -2447,6 +2447,7 @@
{
unsigned int flags =3D EXT3_I(inode)->i_flags;
=20
+ inode->i_flags =3D flags;
inode->i_flags &=3D ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC=
);
if (flags & EXT3_SYNC_FL)
inode->i_flags |=3D S_SYNC;
@@ -2629,7 +2630,8 @@
raw_inode->i_mtime =3D cpu_to_le32(inode->i_mtime.tv_sec);
raw_inode->i_blocks =3D cpu_to_le32(inode->i_blocks);
raw_inode->i_dtime =3D cpu_to_le32(ei->i_dtime);
- raw_inode->i_flags =3D cpu_to_le32(ei->i_flags);
+ raw_inode->i_flags =3D cpu_to_le32((ei->i_flags & ~S_COWLINK) |
+ (inode->i_flags & S_COWLINK));
#ifdef EXT3_FRAGMENTS
raw_inode->i_faddr =3D cpu_to_le32(ei->i_faddr);
raw_inode->i_frag =3D ei->i_frag_no;
Andrew Morton
2004-03-20 19:28:28 UTC
Permalink
Post by Jörn Engel
i_lock is an innermost lock. No locks should be taken inside i_loc=
k.
Post by Jörn Engel
=20
Here, not only is inode_lock being taken inside i_lock but ->dirty=
_inode
Post by Jörn Engel
may be called as well, and dirty_inode() may not be called under a=
ny
Post by Jörn Engel
spinlock.
=20
Is it enough to move the mark_inode_dirty outside of i_lock?
yup. Now you're using i_lock to protect i_flags (which seems reasonabl=
e)
you'll need to hnt down all the other i_flags users and make them use
i_lock too. Currently they appear to be using i_sem.
Jörn Engel
2004-03-21 12:43:02 UTC
Permalink
=20
yup. Now you're using i_lock to protect i_flags (which seems reasona=
ble)
you'll need to hnt down all the other i_flags users and make them use
i_lock too. Currently they appear to be using i_sem.
Interesting task. A few of those users are filesystems that fill
their own flags into inode->i_flags. Here is an example from
fs/ext3/ialloc.c:

static int find_group_orlov(struct super_block *sb, struct inode *paren=
t)
{
...
if ((parent =3D=3D sb->s_root->d_inode) ||
(parent->i_flags & EXT3_TOPDIR_FL)) {

Shouldn't this rather be:

if ((parent =3D=3D sb->s_root->d_inode) ||
(EXT3_I(parent)->i_flags & EXT3_TOPDIR_FL)) {

J=F6rn

--=20
And spam is a useful source of entropy for /dev/random too!
-- Jasmine Strong
Jörn Engel
2004-03-21 18:53:13 UTC
Permalink
=20
yup. Now you're using i_lock to protect i_flags (which seems reasona=
ble)
you'll need to hnt down all the other i_flags users and make them use
i_lock too. Currently they appear to be using i_sem.
Or in some cases nothing at all. Here is the patch against 2.6.4,
sans five non-trivial cases. I'll have to take a closer look at those
first.

J=F6rn

--=20
To recognize individual spam features you have to try to get into the
mind of the spammer, and frankly I want to spend as little time inside
the minds of spammers as possible.
-- Paul Graham

Omissions:
o fs/befs/linuxvfs.c abused i_flags, needs closer inspection
o fs/ext2/ialloc.c dito
o fs/ext3/ialloc.c dito
o fs/smbfs/file.c debug output - harmless
o fs/dquot.c vfs_quota_on() is racy, needs a better fix

drivers/usb/core/inode.c | 2 ++
fs/dquot.c | 9 ++++++++-
fs/ext2/ialloc.c | 2 ++
fs/ext2/inode.c | 2 ++
fs/ext3/ialloc.c | 2 ++
fs/ext3/inode.c | 2 ++
fs/fat/inode.c | 5 ++++-
fs/hfs/inode.c | 2 +-
fs/hfsplus/catalog.c | 4 ++--
fs/hfsplus/dir.c | 10 ++++++++--
fs/hfsplus/inode.c | 6 +++++-
fs/hfsplus/ioctl.c | 2 ++
fs/intermezzo/vfs.c | 7 ++++++-
fs/namei.c | 10 ++++++++--
fs/nfs/inode.c | 2 ++
fs/proc/base.c | 4 ++++
fs/reiserfs/inode.c | 9 ++++++++-
fs/udf/ialloc.c | 2 ++
fs/ufs/ialloc.c | 2 ++
fs/xfs/linux/xfs_ioctl.c | 2 +-
fs/xfs/linux/xfs_super.c | 2 ++
fs/xfs/linux/xfs_vnode.c | 2 ++
fs/xfs/xfs_acl.c | 2 +-
fs/xfs/xfs_cap.c | 2 +-
include/linux/fs.h | 30 ++++++++++++++++++++----------
25 files changed, 99 insertions(+), 25 deletions(-)

--- linux-2.6.4/include/linux/fs.h~lock_flags 2004-03-21 17:43:58.00000=
0000 +0100
+++ linux-2.6.4/include/linux/fs.h 2004-03-21 17:50:38.000000000 +0100
@@ -153,23 +153,24 @@
*/
#define __IS_FLG(inode,flg) ((inode)->i_sb->s_flags & (flg))
=20
-#define IS_RDONLY(inode) ((inode)->i_sb->s_flags & MS_RDONLY)
+#define IS_RDONLY(inode) (__IS_FLG(inode, MS_RDONLY))
#define IS_SYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS) || \
- ((inode)->i_flags & S_SYNC))
+ inode_flags((inode), S_SYNC))
#define IS_DIRSYNC(inode) (__IS_FLG(inode, MS_SYNCHRONOUS|MS_DIRSYNC) =
|| \
- ((inode)->i_flags & (S_SYNC|S_DIRSYNC)))
+ inode_flags((inode), (S_SYNC|S_DIRSYNC)))
#define IS_MANDLOCK(inode) __IS_FLG(inode, MS_MANDLOCK)
=20
-#define IS_QUOTAINIT(inode) ((inode)->i_flags & S_QUOTA)
-#define IS_NOQUOTA(inode) ((inode)->i_flags & S_NOQUOTA)
-#define IS_APPEND(inode) ((inode)->i_flags & S_APPEND)
-#define IS_IMMUTABLE(inode) ((inode)->i_flags & S_IMMUTABLE)
-#define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || ((inode)->i_=
flags & S_NOATIME))
+#define IS_QUOTAINIT(inode) (inode_flags((inode), S_QUOTA))
+#define IS_NOQUOTA(inode) (inode_flags((inode), S_NOQUOTA))
+#define IS_APPEND(inode) (inode_flags((inode), S_APPEND))
+#define IS_IMMUTABLE(inode) (inode_flags((inode), S_IMMUTABLE))
+#define IS_NOATIME(inode) (__IS_FLG(inode, MS_NOATIME) || \
+ inode_flags((inode), S_NOATIME))
#define IS_NODIRATIME(inode) __IS_FLG(inode, MS_NODIRATIME)
#define IS_POSIXACL(inode) __IS_FLG(inode, MS_POSIXACL)
#define IS_ONE_SECOND(inode) __IS_FLG(inode, MS_ONE_SECOND)
=20
-#define IS_DEADDIR(inode) ((inode)->i_flags & S_DEAD)
+#define IS_DEADDIR(inode) (inode_flags((inode), S_DEAD))
=20
/* the read-only stuff doesn't really belong here, but any other place=
is
probably as bad and I don't want to create yet another include file=
=2E */
@@ -414,7 +415,7 @@
=20
unsigned long i_state;
=20
- unsigned int i_flags;
+ unsigned int i_flags; /* protected by i_lock */
unsigned char i_sock;
=20
atomic_t i_writecount;
@@ -428,6 +429,15 @@
#endif
};
=20
+static inline unsigned int inode_flags(struct inode *inode, unsigned i=
nt mask)
+{
+ int ret;
+ spin_lock(inode->i_lock);
+ ret =3D inode->i_flags & mask;
+ spin_unlock(inode->i_lock);
+ return ret;
+}
+
/*
* NOTE: in a 32bit arch with a preemptable kernel and
* an UP compile the i_size_read/write must be atomic
--- linux-2.6.4/drivers/usb/core/inode.c~lock_flags 2004-03-21 17:43:58=
=2E000000000 +0100
+++ linux-2.6.4/drivers/usb/core/inode.c 2004-03-21 17:44:03.000000000 =
+0100
@@ -277,7 +277,9 @@
if (usbfs_empty(dentry)) {
dentry->d_inode->i_nlink -=3D 2;
dput(dentry);
+ spin_lock(inode->i_lock);
inode->i_flags |=3D S_DEAD;
+ spin_unlock(inode->i_lock);
dir->i_nlink--;
error =3D 0;
}
--- linux-2.6.4/fs/xfs/linux/xfs_vnode.c~lock_flags 2004-03-21 17:43:58=
=2E000000000 +0100
+++ linux-2.6.4/fs/xfs/linux/xfs_vnode.c 2004-03-21 17:44:03.000000000 =
+0100
@@ -213,6 +213,7 @@
inode->i_mtime =3D va.va_mtime;
inode->i_ctime =3D va.va_ctime;
inode->i_atime =3D va.va_atime;
+ spin_lock(inode->i_lock);
if (va.va_xflags & XFS_XFLAG_IMMUTABLE)
inode->i_flags |=3D S_IMMUTABLE;
else
@@ -229,6 +230,7 @@
inode->i_flags |=3D S_NOATIME;
else
inode->i_flags &=3D ~S_NOATIME;
+ spin_unlock(inode->i_lock);
VUNMODIFY(vp);
}
return -error;
--- linux-2.6.4/fs/xfs/linux/xfs_ioctl.c~lock_flags 2004-03-21 17:43:58=
=2E000000000 +0100
+++ linux-2.6.4/fs/xfs/linux/xfs_ioctl.c 2004-03-21 17:44:03.000000000 =
+0100
@@ -879,7 +879,7 @@
int attr_flags =3D 0;
int error;
=20
- if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
+ if (inode_flags(vp->v_inode, (S_IMMUTABLE|S_APPEND)))
return -XFS_ERROR(EPERM);
=20
if (filp->f_flags & O_RDONLY)
--- linux-2.6.4/fs/xfs/linux/xfs_super.c~lock_flags 2004-03-21 17:43:58=
=2E000000000 +0100
+++ linux-2.6.4/fs/xfs/linux/xfs_super.c 2004-03-21 17:44:03.000000000 =
+0100
@@ -187,6 +187,7 @@
inode->i_mtime.tv_nsec =3D ip->i_d.di_mtime.t_nsec;
inode->i_ctime.tv_sec =3D ip->i_d.di_ctime.t_sec;
inode->i_ctime.tv_nsec =3D ip->i_d.di_ctime.t_nsec;
+ spin_lock(inode->i_lock);
if (ip->i_d.di_flags & XFS_DIFLAG_IMMUTABLE)
inode->i_flags |=3D S_IMMUTABLE;
else
@@ -203,6 +204,7 @@
inode->i_flags |=3D S_NOATIME;
else
inode->i_flags &=3D ~S_NOATIME;
+ spin_unlock(inode->i_lock);
vp->v_flag &=3D ~VMODIFIED;
}
=20
--- linux-2.6.4/fs/xfs/xfs_acl.c~lock_flags 2004-03-21 17:43:58.0000000=
00 +0100
+++ linux-2.6.4/fs/xfs/xfs_acl.c 2004-03-21 17:44:03.000000000 +0100
@@ -388,7 +388,7 @@
vattr_t va;
int error;
=20
- if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
+ if (inode_flags(&vp->v_inode, (S_IMMUTABLE|S_APPEND)))
return EPERM;
if (kind =3D=3D _ACL_TYPE_DEFAULT && vp->v_type !=3D VDIR)
return ENOTDIR;
--- linux-2.6.4/fs/xfs/xfs_cap.c~lock_flags 2004-03-21 17:43:58.0000000=
00 +0100
+++ linux-2.6.4/fs/xfs/xfs_cap.c 2004-03-21 17:44:03.000000000 +0100
@@ -192,7 +192,7 @@
=20
if (vp->v_vfsp->vfs_flag & VFS_RDONLY)
return EROFS;
- if (vp->v_inode.i_flags & (S_IMMUTABLE|S_APPEND))
+ if (inode_flags(&vp->v_inode, (S_IMMUTABLE|S_APPEND)))
return EPERM;
if ((error =3D _MAC_VACCESS(vp, NULL, VWRITE)))
return error;
--- linux-2.6.4/fs/nfs/inode.c~lock_flags 2004-03-21 17:43:58.000000000=
+0100
+++ linux-2.6.4/fs/nfs/inode.c 2004-03-21 17:44:03.000000000 +0100
@@ -729,7 +729,9 @@
inode->i_ino =3D hash;
=20
/* We can't support update_atime(), since the server will reset it *=
/
+ spin_lock(inode->i_lock);
inode->i_flags |=3D S_NOATIME;
+ spin_unlock(inode->i_lock);
inode->i_mode =3D fattr->mode;
/* Why so? Because we want revalidate for devices/FIFOs, and
* that's precisely what we have in nfs_file_inode_operations.
--- linux-2.6.4/fs/ufs/ialloc.c~lock_flags 2004-03-21 17:43:58.00000000=
0 +0100
+++ linux-2.6.4/fs/ufs/ialloc.c 2004-03-21 17:44:03.000000000 +0100
@@ -283,7 +283,9 @@
=20
if (DQUOT_ALLOC_INODE(inode)) {
DQUOT_DROP(inode);
+ spin_lock(inode->i_lock);
inode->i_flags |=3D S_NOQUOTA;
+ spin_unlock(inode->i_lock);
inode->i_nlink =3D 0;
iput(inode);
return ERR_PTR(-EDQUOT);
--- linux-2.6.4/fs/hfs/inode.c~lock_flags 2004-03-21 17:43:58.000000000=
+0100
+++ linux-2.6.4/fs/hfs/inode.c 2004-03-21 18:50:48.000000000 +0100
@@ -540,7 +540,7 @@
if (atomic_dec_and_test(&HFS_I(inode)->opencnt)) {
down(&inode->i_sem);
hfs_file_truncate(inode);
- //if (inode->i_flags & S_DEAD) {
+ //if (IS_DEADDIR(inode) {
// hfs_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
// hfs_delete_inode(inode);
//}
--- linux-2.6.4/fs/intermezzo/vfs.c~lock_flags 2004-03-21 17:43:58.0000=
00000 +0100
+++ linux-2.6.4/fs/intermezzo/vfs.c 2004-03-21 18:45:10.000000000 +0100
@@ -1445,7 +1445,9 @@
error =3D iops->rmdir(dir->d_inode, dentry);
unlock_kernel();
if (!error) {
+ spin_lock(dentry->d_inode->i_lock);
dentry->d_inode->i_flags |=3D S_DEAD;
+ spin_unlock(dentry->d_inode->i_lock);
error =3D presto_settime(fset, NULL, NULL, dir=
, info,
ATTR_CTIME | ATTR_MTIME=
);
}
@@ -1842,8 +1844,11 @@
error =3D do_rename(fset, old_parent, old_dentry,
new_parent, new_dentry, info)=
;
if (target) {
- if (!error)
+ if (!error) {
+ spin_lock(target);
target->i_flags |=3D S_DEAD;
+ spin_unlock(target);
+ }
// triple_up(&old_dir->i_zombie,
// &new_dir->i_zombie,
// &target->i_zombie);
--- linux-2.6.4/fs/udf/ialloc.c~lock_flags 2004-03-21 17:43:58.00000000=
0 +0100
+++ linux-2.6.4/fs/udf/ialloc.c 2004-03-21 17:44:03.000000000 +0100
@@ -165,7 +165,9 @@
if (DQUOT_ALLOC_INODE(inode))
{
DQUOT_DROP(inode);
+ spin_lock(inode->i_lock);
inode->i_flags |=3D S_NOQUOTA;
+ spin_unlock(inode->i_lock);
inode->i_nlink =3D 0;
iput(inode);
*err =3D -EDQUOT;
--- linux-2.6.4/fs/reiserfs/inode.c~lock_flags 2004-03-21 17:43:58.0000=
00000 +0100
+++ linux-2.6.4/fs/reiserfs/inode.c 2004-03-21 17:44:03.000000000 +0100
@@ -1612,8 +1612,11 @@
/* uid and gid must already be set by the caller for quota init */
=20
/* symlink cannot be immutable or append only, right? */
- if( S_ISLNK( inode -> i_mode ) )
+ if( S_ISLNK( inode -> i_mode ) ) {
+ spin_lock(inode->i_lock);
inode -> i_flags &=3D ~ ( S_IMMUTABLE | S_APPEND );
+ spin_unlock(inode->i_lock);
+ }
=20
inode->i_mtime =3D inode->i_atime =3D inode->i_ctime =3D CURRENT_T=
IME;
inode->i_size =3D i_size;
@@ -2287,6 +2290,7 @@
void sd_attrs_to_i_attrs( __u16 sd_attrs, struct inode *inode )
{
if( reiserfs_attrs( inode -> i_sb ) ) {
+ spin_lock(inode->i_lock);
if( sd_attrs & REISERFS_SYNC_FL )
inode -> i_flags |=3D S_SYNC;
else
@@ -2307,12 +2311,14 @@
REISERFS_I(inode)->i_flags |=3D i_nopack_mask;
else
REISERFS_I(inode)->i_flags &=3D ~i_nopack_mask;
+ spin_unlock(inode->i_lock);
}
}
=20
void i_attrs_to_sd_attrs( struct inode *inode, __u16 *sd_attrs )
{
if( reiserfs_attrs( inode -> i_sb ) ) {
+ spin_lock(inode->i_lock);
if( inode -> i_flags & S_IMMUTABLE )
*sd_attrs |=3D REISERFS_IMMUTABLE_FL;
else
@@ -2329,6 +2335,7 @@
*sd_attrs |=3D REISERFS_NOTAIL_FL;
else
*sd_attrs &=3D ~REISERFS_NOTAIL_FL;
+ spin_unlock(inode->i_lock);
}
}
=20
--- linux-2.6.4/fs/fat/inode.c~lock_flags 2004-03-21 17:43:58.000000000=
+0100
+++ linux-2.6.4/fs/fat/inode.c 2004-03-21 17:44:03.000000000 +0100
@@ -1192,8 +1192,11 @@
MSDOS_I(inode)->mmu_private =3D inode->i_size;
}
if(de->attr & ATTR_SYS)
- if (sbi->options.sys_immutable)
+ if (sbi->options.sys_immutable) {
+ spin_lock(inode->i_lock);
inode->i_flags |=3D S_IMMUTABLE;
+ spin_unlock(inode->i_lock);
+ }
MSDOS_I(inode)->i_attrs =3D de->attr & ATTR_UNUSED;
/* this is as close to the truth as we can get ... */
inode->i_blksize =3D sbi->cluster_size;
--- linux-2.6.4/fs/ext3/ialloc.c~lock_flags 2004-03-21 17:43:58.0000000=
00 +0100
+++ linux-2.6.4/fs/ext3/ialloc.c 2004-03-21 17:44:03.000000000 +0100
@@ -627,7 +627,9 @@
return ret;
=20
fail2:
+ spin_lock(inode->i_lock);
inode->i_flags |=3D S_NOQUOTA;
+ spin_unlock(inode->i_lock);
inode->i_nlink =3D 0;
iput(inode);
brelse(bitmap_bh);
--- linux-2.6.4/fs/ext3/inode.c~lock_flags 2004-03-21 17:43:58.00000000=
0 +0100
+++ linux-2.6.4/fs/ext3/inode.c 2004-03-21 17:44:03.000000000 +0100
@@ -2447,6 +2447,7 @@
{
unsigned int flags =3D EXT3_I(inode)->i_flags;
=20
+ spin_lock(inode->i_lock);
inode->i_flags &=3D ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC=
);
if (flags & EXT3_SYNC_FL)
inode->i_flags |=3D S_SYNC;
@@ -2458,6 +2459,7 @@
inode->i_flags |=3D S_NOATIME;
if (flags & EXT3_DIRSYNC_FL)
inode->i_flags |=3D S_DIRSYNC;
+ spin_unlock(inode->i_lock);
}
=20
void ext3_read_inode(struct inode * inode)
--- linux-2.6.4/fs/proc/base.c~lock_flags 2004-03-21 17:43:58.000000000=
+0100
+++ linux-2.6.4/fs/proc/base.c 2004-03-21 17:44:03.000000000 +0100
@@ -1594,7 +1594,9 @@
inode->i_op =3D &proc_tgid_base_inode_operations;
inode->i_fop =3D &proc_tgid_base_operations;
inode->i_nlink =3D 3;
+ spin_lock(inode->i_lock);
inode->i_flags|=3DS_IMMUTABLE;
+ spin_unlock(inode->i_lock);
=20
dentry->d_op =3D &pid_base_dentry_operations;
=20
@@ -1649,7 +1651,9 @@
inode->i_op =3D &proc_tid_base_inode_operations;
inode->i_fop =3D &proc_tid_base_operations;
inode->i_nlink =3D 3;
+ spin_lock(inode->i_lock);
inode->i_flags|=3DS_IMMUTABLE;
+ spin_unlock(inode->i_lock);
=20
dentry->d_op =3D &pid_base_dentry_operations;
=20
--- linux-2.6.4/fs/ext2/inode.c~lock_flags 2004-03-21 17:43:58.00000000=
0 +0100
+++ linux-2.6.4/fs/ext2/inode.c 2004-03-21 17:44:03.000000000 +0100
@@ -1020,6 +1020,7 @@
{
unsigned int flags =3D EXT2_I(inode)->i_flags;
=20
+ spin_lock(inode->i_lock);
inode->i_flags &=3D ~(S_SYNC|S_APPEND|S_IMMUTABLE|S_NOATIME|S_DIRSYNC=
);
if (flags & EXT2_SYNC_FL)
inode->i_flags |=3D S_SYNC;
@@ -1031,6 +1032,7 @@
inode->i_flags |=3D S_NOATIME;
if (flags & EXT2_DIRSYNC_FL)
inode->i_flags |=3D S_DIRSYNC;
+ spin_unlock(inode->i_lock);
}
=20
void ext2_read_inode (struct inode * inode)
--- linux-2.6.4/fs/ext2/ialloc.c~lock_flags 2004-03-21 17:43:58.0000000=
00 +0100
+++ linux-2.6.4/fs/ext2/ialloc.c 2004-03-21 18:43:39.000000000 +0100
@@ -620,7 +620,9 @@
return inode;
=20
fail2:
+ spin_lock(inode->i_lock);
inode->i_flags |=3D S_NOQUOTA;
+ spin_unlock(inode->i_lock);
inode->i_nlink =3D 0;
iput(inode);
return ERR_PTR(err);
--- linux-2.6.4/fs/namei.c~lock_flags 2004-03-21 17:43:58.000000000 +01=
00
+++ linux-2.6.4/fs/namei.c 2004-03-21 18:33:10.000000000 +0100
@@ -1609,8 +1609,11 @@
error =3D security_inode_rmdir(dir, dentry);
if (!error) {
error =3D dir->i_op->rmdir(dir, dentry);
- if (!error)
+ if (!error) {
+ spin_lock(dentry->d_inode->i_lock);
dentry->d_inode->i_flags |=3D S_DEAD;
+ spin_unlock(dentry->d_inode->i_lock);
+ }
}
}
up(&dentry->d_inode->i_sem);
@@ -1952,8 +1955,11 @@
else=20
error =3D old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_de=
ntry);
if (target) {
- if (!error)
+ if (!error) {
+ spin_lock(target->i_lock);
target->i_flags |=3D S_DEAD;
+ spin_unlock(target->i_lock);
+ }
up(&target->i_sem);
if (d_unhashed(new_dentry))
d_rehash(new_dentry);
--- linux-2.6.4/fs/dquot.c~lock_flags 2004-03-21 17:43:58.000000000 +01=
00
+++ linux-2.6.4/fs/dquot.c 2004-03-21 17:44:03.000000000 +0100
@@ -573,7 +573,9 @@
if (inode->i_dquot[cnt] !=3D NODQUOT)
goto put_it;
}
+ spin_lock(inode->i_lock);
inode->i_flags &=3D ~S_QUOTA;
+ spin_unlock(inode->i_lock);
put_it:
if (dquot !=3D NODQUOT) {
if (dqput_blocks(dquot)) {
@@ -824,8 +826,11 @@
break;
}
inode->i_dquot[cnt] =3D dqget(inode->i_sb, id, cnt);
- if (inode->i_dquot[cnt])
+ if (inode->i_dquot[cnt]) {
+ spin_lock(inode->i_lock);
inode->i_flags |=3D S_QUOTA;
+ spin_unlock(inode->i_lock);
+ }
}
}
up_write(&sb_dqopt(inode->i_sb)->dqptr_sem);
@@ -839,7 +844,9 @@
{
int cnt;
=20
+ spin_lock(inode->i_lock);
inode->i_flags &=3D ~S_QUOTA;
+ spin_unlock(inode->i_lock);
for (cnt =3D 0; cnt < MAXQUOTAS; cnt++) {
to_drop[cnt] =3D inode->i_dquot[cnt];
inode->i_dquot[cnt] =3D NODQUOT;
--- linux-2.6.4/fs/hfsplus/catalog.c~lock_flags 2004-03-21 17:43:58.000=
000000 +0100
+++ linux-2.6.4/fs/hfsplus/catalog.c 2004-03-21 18:49:52.000000000 +010=
0
@@ -54,11 +54,11 @@
=20
static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm=
*perms)
{
- if (inode->i_flags & S_IMMUTABLE)
+ if (IS_IMMUTABLE(inode))
perms->rootflags |=3D HFSPLUS_FLG_IMMUTABLE;
else
perms->rootflags &=3D ~HFSPLUS_FLG_IMMUTABLE;
- if (inode->i_flags & S_APPEND)
+ if (IS_APPEND(inode))
perms->rootflags |=3D HFSPLUS_FLG_APPEND;
else
perms->rootflags &=3D ~HFSPLUS_FLG_APPEND;
--- linux-2.6.4/fs/hfsplus/dir.c~lock_flags 2004-03-21 17:43:58.0000000=
00 +0100
+++ linux-2.6.4/fs/hfsplus/dir.c 2004-03-21 17:44:03.000000000 +0100
@@ -315,8 +315,11 @@
res =3D hfsplus_rename_cat(inode->i_ino,
dir, &dentry->d_name,
HFSPLUS_SB(sb).hidden_dir, &str);
- if (!res)
+ if (!res) {
+ spin_lock(inode->i_lock);
inode->i_flags |=3D S_DEAD;
+ spin_unlock(inode->i_lock);
+ }
return res;
}
res =3D hfsplus_delete_cat(cnid, dir, &dentry->d_name);
@@ -330,8 +333,11 @@
res =3D hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir,=
NULL);
if (!res)
hfsplus_delete_inode(inode);
- } else
+ } else {
+ spin_lock(inode->i_lock);
inode->i_flags |=3D S_DEAD;
+ spin_unlock(inode->i_lock);
+ }
}
inode->i_ctime =3D CURRENT_TIME;
mark_inode_dirty(inode);
--- linux-2.6.4/fs/hfsplus/inode.c~lock_flags 2004-03-21 17:43:58.00000=
0000 +0100
+++ linux-2.6.4/fs/hfsplus/inode.c 2004-03-21 18:48:11.000000000 +0100
@@ -225,6 +225,7 @@
=20
HFSPLUS_I(inode).rootflags =3D perms->rootflags;
HFSPLUS_I(inode).userflags =3D perms->userflags;
+ spin_lock(inode->i_lock);
if (perms->rootflags & HFSPLUS_FLG_IMMUTABLE)
inode->i_flags |=3D S_IMMUTABLE;
else
@@ -233,10 +234,12 @@
inode->i_flags |=3D S_APPEND;
else
inode->i_flags &=3D ~S_APPEND;
+ spin_unlock(inode->i_lock);
}
=20
static void hfsplus_set_perms(struct inode *inode, struct hfsplus_perm=
*perms)
{
+ spin_lock(inode->i_lock);
if (inode->i_flags & S_IMMUTABLE)
perms->rootflags |=3D HFSPLUS_FLG_IMMUTABLE;
else
@@ -245,6 +248,7 @@
perms->rootflags |=3D HFSPLUS_FLG_APPEND;
else
perms->rootflags &=3D ~HFSPLUS_FLG_APPEND;
+ spin_unlock(inode->i_lock);
perms->userflags =3D HFSPLUS_I(inode).userflags;
perms->mode =3D cpu_to_be16(inode->i_mode);
perms->owner =3D cpu_to_be32(inode->i_uid);
@@ -285,7 +289,7 @@
if (atomic_dec_and_test(&HFSPLUS_I(inode).opencnt)) {
down(&inode->i_sem);
hfsplus_file_truncate(inode);
- if (inode->i_flags & S_DEAD) {
+ if (IS_DEADDIR(inode)) {
hfsplus_delete_cat(inode->i_ino, HFSPLUS_SB(sb).hidden_dir, NULL);
hfsplus_delete_inode(inode);
}
--- linux-2.6.4/fs/hfsplus/ioctl.c~lock_flags 2004-03-21 17:43:58.00000=
0000 +0100
+++ linux-2.6.4/fs/hfsplus/ioctl.c 2004-03-21 17:44:03.000000000 +0100
@@ -53,6 +53,7 @@
EXT2_FLAG_NODUMP))
return -EOPNOTSUPP;
=20
+ spin_lock(inode->i_lock);
if (flags & EXT2_FLAG_IMMUTABLE) { /* EXT2_IMMUTABLE_FL */
inode->i_flags |=3D S_IMMUTABLE;
HFSPLUS_I(inode).rootflags |=3D HFSPLUS_FLG_IMMUTABLE;
@@ -67,6 +68,7 @@
inode->i_flags &=3D ~S_APPEND;
HFSPLUS_I(inode).rootflags &=3D ~HFSPLUS_FLG_APPEND;
}
+ spin_unlock(inode->i_lock);
if (flags & EXT2_FLAG_NODUMP) /* EXT2_NODUMP_FL */
HFSPLUS_I(inode).userflags |=3D HFSPLUS_FLG_NODUMP;
else
Patrick J. LoPresti
2004-03-20 15:03:05 UTC
Permalink
Neat stuff! But...
Post by Jörn Engel
+ * Files with the S_COWLINK flag set cannot be written to, if more
+ * than one hard link to them exists. Ultimately, this function
+ * should copy the inode, assign the copy to the dentry and lower us=
e
Post by Jörn Engel
+ * count of the old inode - one day.
What happens if the disk fills while you are making the copy? Will
open(2) on an *existing file* then return ENOSPC?

I do not think you can implement this without changing the interface
to open(2). Which means applications have to be made aware of it
anyway. Which means you might as well leave your implementation as-is
and let userspace worry about creating the copy (and dealing with the
resulting errors).

- Pat
Jörn Engel
2004-03-20 15:23:28 UTC
Permalink
Post by Patrick J. LoPresti
=20
What happens if the disk fills while you are making the copy? Will
open(2) on an *existing file* then return ENOSPC?
Correct. It would be possible to always succeed and return -ENOSPC
on every write(). But then mmap() has the same problem again, so
serious headache would be the only gain from this little excercise.
Post by Patrick J. LoPresti
I do not think you can implement this without changing the interface
to open(2). Which means applications have to be made aware of it
anyway. Which means you might as well leave your implementation as-i=
s
Post by Patrick J. LoPresti
and let userspace worry about creating the copy (and dealing with the
resulting errors).
Good point. Vote is now 2:0 for the simple approach.

J=F6rn

--=20
"Translations are and will always be problematic. They inflict violence=
=20
upon two languages." (translation from German)
Pavel Machek
2004-03-29 17:12:45 UTC
Permalink
Hi!
Post by Jörn Engel
Post by Patrick J. LoPresti
What happens if the disk fills while you are making the copy? Will
open(2) on an *existing file* then return ENOSPC?
Correct. It would be possible to always succeed and return -ENOSPC
on every write(). But then mmap() has the same problem again, so
serious headache would be the only gain from this little excercise.
Post by Patrick J. LoPresti
I do not think you can implement this without changing the interface
to open(2). Which means applications have to be made aware of it
anyway. Which means you might as well leave your implementation as-is
and let userspace worry about creating the copy (and dealing with the
resulting errors).
Good point. Vote is now 2:0 for the simple approach.
Well, 99% need no special handling on ENOSPC during open just
now. However, implementing file copying to each one would be serious
headache.

Applications can not be sure that it is existing file. If you
do stat followed by open, someone may have removed the file in
between. So it is not so new case.
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]
Patrick J. LoPresti
2004-03-29 21:05:25 UTC
Permalink
Post by Pavel Machek
Post by Patrick J. LoPresti
What happens if the disk fills while you are making the copy? Will
open(2) on an *existing file* then return ENOSPC?
Applications can not be sure that it is existing file. If you
do stat followed by open, someone may have removed the file in
between. So it is not so new case.
I should have said, "Will open(2) without O_CREAT then return ENOSPC?"

This is definitely a new case.

For what it's worth, I agree with whoever (Jamie?) said that COW
should be primarily a space optimization, and that semantically the
two files should mostly behave like separate copies.

In fact, I think it is unfortunate, in some ways, that things like
permissions and timestamps are kept in the inode. This means that two
files may only be COW-linked if they also share ownership,
permissions, and timestamps, which makes COW links less useful for
some applications (e.g., sharing source trees among multiple
developers).

But sharing data blocks without sharing inodes is too horrible even to
contemplate, I suppose.

- Pat
Pavel Machek
2004-03-29 23:16:35 UTC
Permalink
Hi!
Post by Patrick J. LoPresti
Post by Pavel Machek
Post by Patrick J. LoPresti
What happens if the disk fills while you are making the copy? Will
open(2) on an *existing file* then return ENOSPC?
Applications can not be sure that it is existing file. If you
do stat followed by open, someone may have removed the file in
between. So it is not so new case.
I should have said, "Will open(2) without O_CREAT then return ENOSPC?"
This is definitely a new case.
For what it's worth, I agree with whoever (Jamie?) said that COW
should be primarily a space optimization, and that semantically the
two files should mostly behave like separate copies.
In fact, I think it is unfortunate, in some ways, that things like
permissions and timestamps are kept in the inode. This means that two
files may only be COW-linked if they also share ownership,
permissions, and timestamps, which makes COW links less useful for
some applications (e.g., sharing source trees among multiple
developers).
I think they *should* have separate permissions.

Also it should be possible to have file with 2 hardlinks cowlinked
somewhere, and possibly make more hardlinks of that one... Having
pointer to another inode in place where direct block pointers normally
are should be enough (thinking ext2 here).
Post by Patrick J. LoPresti
But sharing data blocks without sharing inodes is too horrible even to
contemplate, I suppose.
Why, btw?

Lets say we allocate 4 bits instead of one for block bitmap. Count
"15" is special, now it means "15 or higher". That means we have to
"garbage-collect" to free space that used to have more than 15 links,
but that should not happen too often...
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]
Jamie Lokier
2004-03-31 14:34:12 UTC
Permalink
Post by Pavel Machek
Also it should be possible to have file with 2 hardlinks cowlinked
somewhere, and possibly make more hardlinks of that one... Having
pointer to another inode in place where direct block pointers normally
are should be enough (thinking ext2 here).
Yes.
Post by Pavel Machek
Post by Patrick J. LoPresti
But sharing data blocks without sharing inodes is too horrible even to
contemplate, I suppose.
Why, btw?
Lets say we allocate 4 bits instead of one for block bitmap. Count
"15" is special, now it means "15 or higher". That means we have to
"garbage-collect" to free space that used to have more than 15 links,
but that should not happen too often...
The garbage collection is what's horrible about it :)
Btw, 15 would be exceeded easily in my home directory.

IMHO, an inode whose block pointers points to another, so that whole
files only can be shared, would be fine.

Only one level of indirection would be allowed, so there'd be no
loops, just reference counted shared inodes.

-- Jamie

ps. Sharing individual data blocks is rather complicated. If it were
a very desirable feature, I'd be inclined to go for reference counted
shared extents or reference counted shared indirection blocks
(i.e. sharing at the level or 4Mb or whatever the first indirection
block size is).
Pavel Machek
2004-03-31 14:45:37 UTC
Permalink
Hi!
Post by Jamie Lokier
Post by Pavel Machek
Also it should be possible to have file with 2 hardlinks cowlinked
somewhere, and possibly make more hardlinks of that one... Having
pointer to another inode in place where direct block pointers normally
are should be enough (thinking ext2 here).
Yes.
Post by Pavel Machek
Post by Patrick J. LoPresti
But sharing data blocks without sharing inodes is too horrible even to
contemplate, I suppose.
Why, btw?
Lets say we allocate 4 bits instead of one for block bitmap. Count
"15" is special, now it means "15 or higher". That means we have to
"garbage-collect" to free space that used to have more than 15 links,
but that should not happen too often...
The garbage collection is what's horrible about it :)
Btw, 15 would be exceeded easily in my home directory.
Well, but chances are that you'll never unlink such files... Leaving
garbage collection to fsck would make it rather easy.
Post by Jamie Lokier
IMHO, an inode whose block pointers points to another, so that whole
files only can be shared, would be fine.
Yes, this is probably way better way to do that.
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]
Jamie Lokier
2004-03-31 15:20:12 UTC
Permalink
Post by Pavel Machek
Post by Jamie Lokier
The garbage collection is what's horrible about it :)
Btw, 15 would be exceeded easily in my home directory.
Well, but chances are that you'll never unlink such files... Leaving
garbage collection to fsck would make it rather easy.
No, I'd unlink them often. Every time I clone a source tree, make
some changes, compile, maybe test :), then delete the tree.

I like to fsck maybe once every 6 months.

-- Jamie
Tim Connors
2004-04-02 11:44:28 UTC
Permalink
Post by Pavel Machek
Hi!
Post by Jamie Lokier
The garbage collection is what's horrible about it :)
Btw, 15 would be exceeded easily in my home directory.
Well, but chances are that you'll never unlink such files... Leaving
garbage collection to fsck would make it rather easy.
How often do you fsck? Sure, I fscked the other day, but that's
because some idiot unplugged my laptop.

I sure as hell delete copied trees more often than once every 6 months
though :)
--
TimC -- http://astronomy.swin.edu.au/staff/tconnors/
I'm sorry. The number you have reached is imaginary. Please rotate your
phone 90 degrees and try again.
Jörn Engel
2004-04-02 16:54:40 UTC
Permalink
Post by Pavel Machek
=20
I think they *should* have separate permissions.
That makes the count 2:2. I'll continue to follow the simple solution
for some time, but wouldn't like to have it included for now (or ever?)
Post by Pavel Machek
Also it should be possible to have file with 2 hardlinks cowlinked
somewhere, and possibly make more hardlinks of that one... Having
pointer to another inode in place where direct block pointers normall=
y
Post by Pavel Machek
are should be enough (thinking ext2 here).
All right, you are proposing hell. I've tried to think through all
possibilities and was too scared to continue. So limitation is that
cowlinks and hardlinks are mutually exclusive, which eliminated all
problems.

If you really want cowlinks and hardlinks to be intermixed freely, I'd
happily agree with you as soon as you can define the behaviour for all
possible cases in a simple document and none of them make me scared
again. Show me that it is possible and makes sense.

J=F6rn

--=20
A quarrel is quickly settled when deserted by one party; there is
no battle unless there be two.
-- Seneca
Pavel Machek
2004-04-02 18:01:28 UTC
Permalink
Hi!
Post by Jörn Engel
Post by Pavel Machek
Also it should be possible to have file with 2 hardlinks cowlinked
somewhere, and possibly make more hardlinks of that one... Having
pointer to another inode in place where direct block pointers normally
are should be enough (thinking ext2 here).
All right, you are proposing hell. I've tried to think through all
possibilities and was too scared to continue. So limitation is that
cowlinks and hardlinks are mutually exclusive, which eliminated all
problems.
I do not think I'm proposing hell.
Post by Jörn Engel
If you really want cowlinks and hardlinks to be intermixed freely, I'd
happily agree with you as soon as you can define the behaviour for all
possible cases in a simple document and none of them make me scared
again. Show me that it is possible and makes sense.
Okay:

User/kernel API modifications for cowlinks

open(..., O_RDWR) may return ENOSPACE

new syscall is introduced, copyfile(int fd_from, int fd_to). fd_to
must be empty, or it returns -EINVAL. copyfile() copies content of
file from one file to another. It may return success even through
there's not enough space on filesystem to actually do the copy. It is
also pretty fast.

another syscall is introduced for diff and friends, long long
get_data_id(int fd). It may only be used on non-zero-length regular
files. if get_data_id(fd1) == get_data_id(fd2), it means that files
fd1 and fd2 contain same data and you do not need to read them to
check it.

df might be overly optimistic

Implications

In my proposlal, diff will not automagically sense files contain same
stuff using (dev, ino); if you want speedups, you'll have to teach
diff to call get_data_id.

Users do not really know cowlinks exist. Disk space behaves funny, and
copyfile is somehow fast, but otherwise its normal UNIX system.

Trivial implementation does copyfile by real copying (=>slow, takes
lots of space), and returns error on get_data_id. (Or it might return
inode#, but returning error is probably better).

[And yes, I believe this can actually be implemented in usefull
way. Are you scared or should I continue?]

Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]
Jörn Engel
2004-04-02 18:17:07 UTC
Permalink
Post by Pavel Machek
=20
I do not think I'm proposing hell.
Yet. :)
Post by Pavel Machek
If you really want cowlinks and hardlinks to be intermixed freely, =
I'd
Post by Pavel Machek
happily agree with you as soon as you can define the behaviour for =
all
Post by Pavel Machek
possible cases in a simple document and none of them make me scared
again. Show me that it is possible and makes sense.
=20
=20
User/kernel API modifications for cowlinks
=20
open(..., O_RDWR) may return ENOSPACE
=20
new syscall is introduced, copyfile(int fd_from, int fd_to). fd_to
must be empty, or it returns -EINVAL. copyfile() copies content of
file from one file to another. It may return success even through
there's not enough space on filesystem to actually do the copy. It is
also pretty fast.
=20
another syscall is introduced for diff and friends, long long
get_data_id(int fd). It may only be used on non-zero-length regular
files. if get_data_id(fd1) =3D=3D get_data_id(fd2), it means that fil=
es
Post by Pavel Machek
fd1 and fd2 contain same data and you do not need to read them to
check it.
Sounds good, but you missed the hell part.

What happens, if you copyfile() a file that has two links?
Then you link() the result.
Then you copyfile() one of those two links.
Then you link()...

Now you write to either one of the six files. What happens?

Give me a clean proposal how this is simple, defined and not too
dangerous for the unaware. Then I agree, there is no hell involved.

J=F6rn

--=20
When in doubt, use brute force.
-- Ken Thompson
Pavel Machek
2004-04-02 18:23:58 UTC
Permalink
Hi!
Post by Jörn Engel
Post by Pavel Machek
Post by Jörn Engel
If you really want cowlinks and hardlinks to be intermixed freely, I'd
happily agree with you as soon as you can define the behaviour for all
possible cases in a simple document and none of them make me scared
again. Show me that it is possible and makes sense.
User/kernel API modifications for cowlinks
open(..., O_RDWR) may return ENOSPACE
new syscall is introduced, copyfile(int fd_from, int fd_to). fd_to
must be empty, or it returns -EINVAL. copyfile() copies content of
file from one file to another. It may return success even through
there's not enough space on filesystem to actually do the copy. It is
also pretty fast.
another syscall is introduced for diff and friends, long long
get_data_id(int fd). It may only be used on non-zero-length regular
files. if get_data_id(fd1) == get_data_id(fd2), it means that files
fd1 and fd2 contain same data and you do not need to read them to
check it.
Sounds good, but you missed the hell part.
:-).
Post by Jörn Engel
Now you write to either one of the six files. What happens?
Give me a clean proposal how this is simple, defined and not too
dangerous for the unaware. Then I agree, there is no hell involved.
Okay, now I have to start talking about implementation. Assume ext2 as
a base. Theres new object "cowid" which contains, well, id for
get_data_id() and usage count. Each inode either has pointer to
"cowid" object, or it is plain old regular file.

INODE123 Usage count = 2.
Post by Jörn Engel
What happens, if you copyfile() a file that has two links?
copyfile results in:

INODE123 Usage count = 2, pointer to cowid 567
COWID 567: Usage count = 2
INODE124 Usage count = 1, pointer to cowid 567
Post by Jörn Engel
Then you link() the result.
No, problem just increase usage count on inode124:

INODE123 Usage count = 2, pointer to cowid 567
COWID 567: Usage count = 2
INODE124 Usage count = 2, pointer to cowid 567
Post by Jörn Engel
Then you copyfile() one of those two links.
Creates another inode pointing at same old cowid:

INODE123 Usage count = 2, pointer to cowid 567
COWID 567: Usage count = 3
INODE124 Usage count = 2, pointer to cowid 567
INODE125 Usage count = 1, pointer to cowid 567
Post by Jörn Engel
Then you link()...
INODE123 Usage count = 2, pointer to cowid 567
COWID 567: Usage count = 3
INODE124 Usage count = 2, pointer to cowid 567
INODE125 Usage count = 2, pointer to cowid 567

Now, if I write to any inode with has cowid, data have to be copied,
and pointer to cowid deleted from that inode .

Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]
Ross Biro
2004-04-02 19:28:55 UTC
Permalink
Post by Jörn Engel
If you really want cowlinks and hardlinks to be intermixed freely, I'd
happily agree with you as soon as you can define the behaviour for all
possible cases in a simple document and none of them make me scared
again. Show me that it is possible and makes sense.
Maybe it's easiest to view the proposed copyfile() as being
semantically equivalent to cp from the point of view of anything above
the actual file system (modulo running out of space at weird times)

Then all the questions are easy to answer, and it would also be
possible to implement copyfile at the VFS layer as cp for file systems
that don't support it.

Of course, it gets more interesting if you try to do it at the block
level instead of at the file level. For ext2, you could just reserve
a block #, say -1, to mean take the data from the master cow file, and
anything else is treated normally. You would need a deamon to make
sure you were still saving space though.
Pavel Machek
2004-04-02 21:35:04 UTC
Permalink
Hi!
Post by Ross Biro
If you really want cowlinks and hardlinks to be intermixed fr=
eely, I'd
Post by Ross Biro
happily agree with you as soon as you can define the behaviou=
r for all
Post by Ross Biro
possible cases in a simple document and none of them make me =
scared
Post by Ross Biro
again. Show me that it is possible and makes sense.
=20
Maybe it's easiest to view the proposed copyfile() as being
semantically equivalent to cp from the point of view of anything abov=
e
Post by Ross Biro
the actual file system (modulo running out of space at weird times)
Yes, this is what I was trying to propose.
Pavel
--=20
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]
Jörn Engel
2004-04-05 08:12:31 UTC
Permalink
Post by Ross Biro
=20
Of course, it gets more interesting if you try to do it at the block
level instead of at the file level. For ext2, you could just reserve
a block #, say -1, to mean take the data from the master cow file, an=
d
Post by Ross Biro
anything else is treated normally. You would need a deamon to make
sure you were still saving space though.
More interesting is correct. I see the advantages and proposed this
myself some time ago, but there are downsides. Basically, for each
block you need additional data, at least a counter telling you the
number of users it currently has. Eats up memory.

If it really has to make sense, you also have to detect duplicated
blocks at runtime. So you need a checksum for each block and a
balanced tree containing those checksums or some other means of quick
access. Eats up 40 bytes (16 checksum, 3*8 tree pointers). With 4k
blocks, that's 1% memory overhead.

Runtime is even worse. Unless the tree fits into memory, you have 1-3
disk reads for each write. Most filesystem developers don't like to
hear such news.

=46rankly, the disadvantages still outweigh the advantages today. With
64k blocks, more memory and more diskspace and in comparison less
unique blocks, it might make sense. Later. :)

J=F6rn

--=20
Ninety percent of everything is crap.
-- Sturgeon's Law
Pavel Machek
2004-04-05 08:19:08 UTC
Permalink
Hi!
Post by Jörn Engel
Post by Ross Biro
Of course, it gets more interesting if you try to do it at the block
level instead of at the file level. For ext2, you could just reserve
a block #, say -1, to mean take the data from the master cow file, and
anything else is treated normally. You would need a deamon to make
sure you were still saving space though.
More interesting is correct. I see the advantages and proposed this
myself some time ago, but there are downsides. Basically, for each
block you need additional data, at least a counter telling you the
number of users it currently has. Eats up memory.
If it really has to make sense, you also have to detect duplicated
blocks at runtime. So you need a checksum for each block and a
balanced tree containing those checksums or some other means of quick
access. Eats up 40 bytes (16 checksum, 3*8 tree pointers). With 4k
blocks, that's 1% memory overhead.
Well, you could do this in userspace, do it only if system is idle,
and use "scan" type approach.

But I agree that's far away.

BTW what about the "mix hardlinks with cowlinks" proposal? You said it
leads to hell and then I did not hear from you. Did it scare you that
much? ;-)
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]
Jörn Engel
2004-04-05 08:45:51 UTC
Permalink
=20
BTW what about the "mix hardlinks with cowlinks" proposal? You said i=
t
leads to hell and then I did not hear from you. Did it scare you that
much? ;-)
It did in the beginning. Over the weekend (without mail access) I
pretty much found the same solution you did, but without your
insistence, I hadn't even though about it. All glory to you! ;)

J=F6rn

--=20
A surrounded army must be given a way out.
-- Sun Tzu
Jamie Lokier
2004-04-02 20:09:21 UTC
Permalink
Post by Pavel Machek
Okay, now I have to start talking about implementation. Assume ext2 as
a base. Theres new object "cowid" which contains, well, id for
get_data_id() and usage count. Each inode either has pointer to
"cowid" object, or it is plain old regular file.
Pavel has it exactly right.

A simple way to store COWID objects in the filesystem itself is as
another ordinary inode. The attributes of that inode (mtime, mode
etc.) aren't important (except to fsck), only the size and data
pointers are important. The files which point to a COWID need a flag
to indicate that, too.

Someone said that a problem with using sendfile() to create cowlinks
is that sendfile() takes a length parameter. It's a size_t, which
isn't large enough to copy Large files.

Actually that isn't a problem. The COWID inodes contain the real
length of the shared data. It would be possible for the individual
cowlink inodes to have a smaller length, indicating that they don't
have all the data. Then a cp implementation which calls sendfile()
repeatedly could copy large files and still share the data. Provided
the offset and length are compatible with sharing, the first
sendfile() would create the COWID (if it doesn't exist already), and
subsequent ones would simply enlarge the individual cowlink's length
attribute. It's not the cleanest interface; I only mention it because
sendfile() exists already and could be used.

get_data_id() is one way to detect equivalent files. Another would be
a function files_equal(fd1, fd2) which returns a boolean.
get_data_id() has the advantage that it can report immediately whether
a file has _any_ cowlink peers, which is important for programs that
scan trees. Perhaps getxattr() would be reasonable interface, using a
named attribute "data-id".

-- Jamie
Pavel Machek
2004-04-02 21:39:34 UTC
Permalink
Hi!
Post by Jamie Lokier
Post by Pavel Machek
Okay, now I have to start talking about implementation. Assume ext2 as
a base. Theres new object "cowid" which contains, well, id for
get_data_id() and usage count. Each inode either has pointer to
"cowid" object, or it is plain old regular file.
Pavel has it exactly right.
A simple way to store COWID objects in the filesystem itself is as
another ordinary inode. The attributes of that inode (mtime, mode
etc.) aren't important (except to fsck), only the size and data
pointers are important. The files which point to a COWID need a flag
to indicate that, too.
Actually, my solution has one weirdness...
Post by Jamie Lokier
a
copyfile a b
rm a

...now b has pointer to cowid with usage count of 1. Which is slightly
ugly (and wastes one cowid entry), but should be harmless.
Post by Jamie Lokier
get_data_id() is one way to detect equivalent files. Another would be
a function files_equal(fd1, fd2) which returns a boolean.
files_equal(...) would lead to quadratic number of calls, no?
Post by Jamie Lokier
get_data_id() has the advantage that it can report immediately whether
a file has _any_ cowlink peers, which is important for programs that
scan trees. Perhaps getxattr() would be reasonable interface, using a
named attribute "data-id".
Yes, get_data_id() is extremely ugly name.
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]
Chris Friesen
2004-04-02 22:00:32 UTC
Permalink
Post by Pavel Machek
Actually, my solution has one weirdness...
a
copyfile a b
rm a
...now b has pointer to cowid with usage count of 1. Which is slightly
ugly (and wastes one cowid entry), but should be harmless.
Could you not change it back to a normal inode when refcount becomes 1?
Or if you didn't want to do that always (say if you knew there would
be more references being created soon) you could at least have some kind
of cleanup tool that you could manually run on a filesystem to clean it up?

Chris
Jamie Lokier
2004-04-03 00:49:47 UTC
Permalink
Post by Chris Friesen
Could you not change it back to a normal inode when refcount becomes 1?
You can only do that if the cowid object has a pointer to the last
remaining reference to it. That's possible, but more complicated and
would incur a little more I/O per cow operation.
Post by Chris Friesen
Or if you didn't want to do that always (say if you knew there would
be more references being created soon) you could at least have some kind
of cleanup tool that you could manually run on a filesystem to clean it up?
fsck could do it. It's not a big deal though: simply looking up the
inode through the last remaining path can also clean it up. Until
them, it's very little space used: the same as a short symlink.

-- Jamie
Pavel Machek
2004-04-03 08:23:03 UTC
Permalink
Hi!
Post by Jamie Lokier
Post by Chris Friesen
Could you not change it back to a normal inode when refcount becomes 1?
You can only do that if the cowid object has a pointer to the last
remaining reference to it. That's possible, but more complicated and
would incur a little more I/O per cow operation.
You'd have to have pointers to all references to it... because you
can't tell in advance which one will be the last to go away.

But I agree its not a big problem.
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]
Jamie Lokier
2004-04-03 13:15:39 UTC
Permalink
Post by Pavel Machek
Post by Jamie Lokier
Post by Chris Friesen
Could you not change it back to a normal inode when refcount becomes 1?
You can only do that if the cowid object has a pointer to the last
remaining reference to it. That's possible, but more complicated and
would incur a little more I/O per cow operation.
You'd have to have pointers to all references to it... because you
can't tell in advance which one will be the last to go away.
Exactly. Each of the cow pointers would need to be linked in a doubly
linked list containing them all.

-- Jamie
Jörn Engel
2004-04-05 08:19:26 UTC
Permalink
Post by Pavel Machek
Could you not change it back to a normal inode when refcount be=
comes 1?=20
Post by Pavel Machek
=20
You can only do that if the cowid object has a pointer to the las=
t
Post by Pavel Machek
remaining reference to it. That's possible, but more complicated=
and
Post by Pavel Machek
would incur a little more I/O per cow operation.
=20
You'd have to have pointers to all references to it... because you
can't tell in advance which one will be the last to go away.
=20
Exactly. Each of the cow pointers would need to be linked in a doubl=
y
linked list containing them all.
I don't like the list idea. Having the extra cowid (I prefer inode)
indirection costs a few bytes and one lookup, not much. The list is
way too much overhead to get rid of so little in a few cases.

If you really want to, create a new syscall foldfile() that will
remove the indirection for one file, if possible. Then userspace can
do the ugly work of scanning for single-linked cowids (or just leave
it).

J=F6rn

--=20
The competent programmer is fully aware of the strictly limited size of
his own skull; therefore he approaches the programming task in full
humility, and among other things he avoids clever tricks like the plagu=
e.=20
-- Edsger W. Dijkstra
Pavel Machek
2004-04-05 08:22:58 UTC
Permalink
Hi!
Post by Jörn Engel
Post by Jamie Lokier
Post by Pavel Machek
Post by Jamie Lokier
Post by Chris Friesen
Could you not change it back to a normal inode when refcount becomes 1?
You can only do that if the cowid object has a pointer to the last
remaining reference to it. That's possible, but more complicated and
would incur a little more I/O per cow operation.
You'd have to have pointers to all references to it... because you
can't tell in advance which one will be the last to go away.
Exactly. Each of the cow pointers would need to be linked in a doubly
linked list containing them all.
I don't like the list idea. Having the extra cowid (I prefer inode)
indirection costs a few bytes and one lookup, not much. The list is
way too much overhead to get rid of so little in a few cases.
Nobody likes the "list idea".
Post by Jörn Engel
If you really want to, create a new syscall foldfile() that will
remove the indirection for one file, if possible. Then userspace can
do the ugly work of scanning for single-linked cowids (or just leave
it).
We could automaticaly remove them when we see them, or on first open(
O_RDWR) or something, or just leave them alone. Definitely leave them
alone for first version.
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]
Jamie Lokier
2004-04-03 00:46:35 UTC
Permalink
Post by Pavel Machek
Actually, my solution has one weirdness...
Post by Jamie Lokier
a
copyfile a b
rm a
...now b has pointer to cowid with usage count of 1. Which is slightly
ugly (and wastes one cowid entry), but should be harmless.
That's necessary, unless the cowid object has a linked list of all the
inodes which point to it, a bit like inodes having a linke list of all
parent directories which point to them. That's not impossible, but
leaving the unnecessary cowid object is much simpler and will result
in less I/O (no doubly-linked list to update). It can be garbage
collected when the last reference is followed to it.
Post by Pavel Machek
Post by Jamie Lokier
get_data_id() is one way to detect equivalent files. Another would be
a function files_equal(fd1, fd2) which returns a boolean.
files_equal(...) would lead to quadratic number of calls, no?
Yes. </blush>

-- Jamie
Jamie Lokier
2004-04-03 01:04:25 UTC
Permalink
Here's a tricky situation:

1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.

2. At this point both mappings share the same pages in RAM.

3. Then one of the cowlinks is written to...

-- Jamie
Erik Andersen
2004-04-03 01:21:12 UTC
Permalink
Post by Jamie Lokier
1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
2. At this point both mappings share the same pages in RAM.
3. Then one of the cowlinks is written to...
Using mmap with PROT_WRITE on a cowlink must preemptively
break the link.

-Erik

--
Erik B. Andersen http://codepoet-consulting.com/
--This message was written using 73% post-consumer electrons--
Jamie Lokier
2004-04-03 01:59:20 UTC
Permalink
Post by Erik Andersen
Post by Jamie Lokier
1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
2. At this point both mappings share the same pages in RAM.
3. Then one of the cowlinks is written to...
Using mmap with PROT_WRITE on a cowlink must preemptively
break the link.
I forget to mention, they are PROT_READ shared mappings.

-- Jamie
Ross Biro
2004-04-03 03:55:08 UTC
Permalink
Post by Jamie Lokier
Post by Erik Andersen
Post by Jamie Lokier
1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
2. At this point both mappings share the same pages in RAM.
3. Then one of the cowlinks is written to...
Using mmap with PROT_WRITE on a cowlink must preemptively
break the link.
I forget to mention, they are PROT_READ shared mappings.
Or just also make the page cow and not break the link until someone
actually writes to it. Really depends on how you view the performance
impact, whether you do cow at the block of inode level, and how much
you want to touch (or copy parts of) the mm.
Pavel Machek
2004-04-03 09:09:20 UTC
Permalink
Hi!
Post by Jamie Lokier
Post by Erik Andersen
Post by Jamie Lokier
1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
2. At this point both mappings share the same pages in RAM.
3. Then one of the cowlinks is written to...
Using mmap with PROT_WRITE on a cowlink must preemptively
break the link.
I forget to mention, they are PROT_READ shared mappings.
I'm not mm guru, but... with rmap, we should be able to find all the
users of that shared memory, and unmap their pages, right?

Until copy is done, we don't do anything, because write is not allowed
to progress until copy is done. When copy is done we should unmap all
the pages that still point to "old" copy, let write progress, and make
users fault in.
Pavel
--
When do you have a heart between your knees?
[Johanka's followup: and *two* hearts?]
Jamie Lokier
2004-04-03 13:27:25 UTC
Permalink
Post by Pavel Machek
Post by Jamie Lokier
Post by Erik Andersen
Post by Jamie Lokier
1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
2. At this point both mappings share the same pages in RAM.
3. Then one of the cowlinks is written to...
Using mmap with PROT_WRITE on a cowlink must preemptively
break the link.
I forget to mention, they are PROT_READ shared mappings.
I'm not mm guru, but... with rmap, we should be able to find all the
users of that shared memory, and unmap their pages, right?
Yes. I bring it up only because it's tricky, and the simple cowlink
implementations so far don't deal with it.

A page can only exist in one address_space. So if pages are shared
before the cow is broken, the address_space must be of the shared
cowid object, not an individual address_space per cowlink.
Afterwards, the copied pages are in the non-shared cowlink object's
address_space.
Post by Pavel Machek
Until copy is done, we don't do anything, because write is not allowed
to progress until copy is done. When copy is done we should unmap all
the pages that still point to "old" copy, let write progress, and make
users fault in.
I agree.

(Ross suggested using COW pages. While technically possible, that
would be pretty complicated to implemented as it implies pages shared
among more than one address_space, and the facility for write() to
break COW sharing in the page cache, and update page tables when that
happens.)

-- Jamie
Eric W. Biederman
2004-04-03 18:39:53 UTC
Permalink
Post by Jamie Lokier
1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
2. At this point both mappings share the same pages in RAM.
Why they have different inodes?
Post by Jamie Lokier
3. Then one of the cowlinks is written to...
I would not worry about sharing page cache entries unless this becomes
a common case. If you want to avoid the hit of rereading the file when
you have a cow copy it should be simple enough to walk through the list
of cow copies and see if anyone else has it open.

Eric
Jamie Lokier
2004-04-03 19:43:44 UTC
Permalink
Post by Eric W. Biederman
Post by Jamie Lokier
1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
2. At this point both mappings share the same pages in RAM.
Why they have different inodes?
Did you miss the last 20 or so messages in this thread?

We'd like cowlinks that are an invisible filesystem optimisation.
That means you "copy" a file and it behaves the same as if you copy a file.
Post by Eric W. Biederman
Post by Jamie Lokier
3. Then one of the cowlinks is written to...
I would not worry about sharing page cache entries unless this becomes
a common case. If you want to avoid the hit of rereading the file when
you have a cow copy it should be simple enough to walk through the list
of cow copies and see if anyone else has it open.
It is not a question of performance, it's correctness. Either you
have cowlinks that behave like copied files, or you accept that when
cowlinked files are mmapped and written to, they don't behave like
regular files (not even the original file prior to cowlinking does).

Btw, I'm not suggesting sharing page cache entries.

-- Jamie
Eric W. Biederman
2004-04-03 20:30:24 UTC
Permalink
Post by Jamie Lokier
Post by Eric W. Biederman
Post by Jamie Lokier
1. A file is cowlinked. Then each cowlink is mmap()'d, one per process.
2. At this point both mappings share the same pages in RAM.
Why they have different inodes?
Did you miss the last 20 or so messages in this thread?
We'd like cowlinks that are an invisible filesystem optimisation.
That means you "copy" a file and it behaves the same as if you copy a file.
Exactly so they would not share the same pages in RAM.
Post by Jamie Lokier
Post by Eric W. Biederman
Post by Jamie Lokier
3. Then one of the cowlinks is written to...
I would not worry about sharing page cache entries unless this becomes
a common case. If you want to avoid the hit of rereading the file when
you have a cow copy it should be simple enough to walk through the list
of cow copies and see if anyone else has it open.
It is not a question of performance, it's correctness. Either you
have cowlinks that behave like copied files, or you accept that when
cowlinked files are mmapped and written to, they don't behave like
regular files (not even the original file prior to cowlinking does).
Btw, I'm not suggesting sharing page cache entries.
It sounded like you assumed sharing of page cache entries above.
How do you get to step 2 if the cow copies don't share the same page
cache entries?

Eric
Jamie Lokier
2004-04-03 21:59:41 UTC
Permalink
Post by Eric W. Biederman
Post by Jamie Lokier
We'd like cowlinks that are an invisible filesystem optimisation.
That means you "copy" a file and it behaves the same as if you copy a file.
Exactly so they would not share the same pages in RAM.
That is one way to implement it.
Post by Eric W. Biederman
Post by Jamie Lokier
Btw, I'm not suggesting sharing page cache entries.
It sounded like you assumed sharing of page cache entries above.
How do you get to step 2 if the cow copies don't share the same page
cache entries?
Ah. A misunderstanding on my part.

I mean not sharing page cache entries between different
address_spaces, but sharing between different cowlinks which use the
same underlying address_space.

I had in mind that since each cowlink is a separate inode, but both
inodes point to a shared data structure in the filesystem, they would
map pages out of a shared address_space representing that data
structure. You've pointed out that it isn't necessary to do that, and
it's probably simpler not to.

Now I see your point. Page sharing could be avoided completely, if
when mapping a cowlink the page was _copied_ from the shared
address_space to the cowlink's own address_space. Copying also solves
the mlock() problem. (A shared address_space is still required, because
you may cowlink a file which has dirty pages in RAM).

Copying raises a different problem: what to do when a non-cowlink file
is mapped (PROT_READ), and then it's cowlinked while the mapping is in
place. The non-cowlink inode gets converted to a cowlink inode. The
pages are hashed in the original address_space, and you now have a
mapping of a cowlink file where the mapped pages are _not copies_ of
pages in the shared address_space.

-- Jamie
Eric W. Biederman
2004-04-04 08:15:50 UTC
Permalink
Post by Jamie Lokier
Post by Eric W. Biederman
Post by Jamie Lokier
We'd like cowlinks that are an invisible filesystem optimisation.
That means you "copy" a file and it behaves the same as if you copy a file.
Exactly so they would not share the same pages in RAM.
That is one way to implement it.
Post by Eric W. Biederman
Post by Jamie Lokier
Btw, I'm not suggesting sharing page cache entries.
It sounded like you assumed sharing of page cache entries above.
How do you get to step 2 if the cow copies don't share the same page
cache entries?
Ah. A misunderstanding on my part.
I mean not sharing page cache entries between different
address_spaces, but sharing between different cowlinks which use the
same underlying address_space.
I had in mind that since each cowlink is a separate inode, but both
inodes point to a shared data structure in the filesystem, they would
map pages out of a shared address_space representing that data
structure. You've pointed out that it isn't necessary to do that, and
it's probably simpler not to.
Especially since the VM layer has no concept of COW except for private
anonymous pages. Which does not map to a POSIX cow on files.
Post by Jamie Lokier
Now I see your point. Page sharing could be avoided completely, if
when mapping a cowlink the page was _copied_ from the shared
address_space to the cowlink's own address_space. Copying also solves
the mlock() problem. (A shared address_space is still required, because
you may cowlink a file which has dirty pages in RAM).
Either that your you call fsync(file) as part of generating the cow
copy.
Post by Jamie Lokier
Copying raises a different problem: what to do when a non-cowlink file
is mapped (PROT_READ), and then it's cowlinked while the mapping is in
place. The non-cowlink inode gets converted to a cowlink inode. The
pages are hashed in the original address_space, and you now have a
mapping of a cowlink file where the mapped pages are _not copies_ of
pages in the shared address_space.
If you don't flush things to disk first there are certainly some sharing
issues. Flushing the data to the address_space of the shared inode
should work just as well though.

What you must have is a clear state change from caching a normal file
to cache a cow file. Where certain parts of the behavior changes.

What this needs to do depends on the VFS at the time.

If sharing is introduced past that state change things get tricky to
manage. I just don't want to think about those issues just now...

Eric
Jörn Engel
2004-04-05 08:35:49 UTC
Permalink
Post by Jamie Lokier
=20
Btw, I'm not suggesting sharing page cache entries.
But I am!!!

Sharing the page cache is more important to me than sharing disk
space. Disk space is cheap, but increasing memory beyond 1GiB in my
notebook is not and 1GiB is too little, so memory is the real
constraint.

And it looks like Pavel already found the solution. Whenever doing
something fishy that would confuse the page cache, we
1. lock
2. invalidate page cache for all files belonging to that cow entity
3. copyfile(), write(), or whatever
4. unlock

This is always possibly, because page cache for cow-files is never
read-write. If it was, we would have done 1-4 before and now have a
regular (non-cow) file.

Did I miss something?

J=F6rn

--=20
Premature optimization is the root of all evil.
-- Donald Knuth
Eric W. Biederman
2004-04-05 09:15:01 UTC
Permalink
Post by Jörn Engel
Post by Jamie Lokier
=20
Btw, I'm not suggesting sharing page cache entries.
=20
But I am!!!
=20
Sharing the page cache is more important to me than sharing disk
space. Disk space is cheap, but increasing memory beyond 1GiB in my
notebook is not and 1GiB is too little, so memory is the real
constraint.
And it looks like Pavel already found the solution. Whenever doing
something fishy that would confuse the page cache, we
1. lock
2. invalidate page cache for all files belonging to that cow entity
3. copyfile(), write(), or whatever
4. unlock
=20
This is always possibly, because page cache for cow-files is never
read-write. If it was, we would have done 1-4 before and now have a
regular (non-cow) file.
=20
Did I miss something?
I know a writable mmap needs to trigger a copy in that case.
And then are fun cases with MAP_FIXED which may mean invalidation
is not allowed.

As scheme that does not isolate the invalidate to the new copy worries =
me.

Eric
Jörn Engel
2004-04-05 09:18:48 UTC
Permalink
Post by Eric W. Biederman
=20
I know a writable mmap needs to trigger a copy in that case.
And then are fun cases with MAP_FIXED which may mean invalidation
is not allowed.
Sounds fishy, so it will trigger cow. Done. :)
Post by Eric W. Biederman
As scheme that does not isolate the invalidate to the new copy worrie=
s me.

If you can come up with a better way, please do. Right now I cannot
think of anything better, but Pavel already showed how little that
means.

J=F6rn

--=20
The wise man seeks everything in himself; the ignorant man tries to get
everything from somebody else.
-- unknown
Pavel Machek
2004-04-05 11:43:12 UTC
Permalink
Hi!
Post by Eric W. Biederman
Post by Jörn Engel
And it looks like Pavel already found the solution. Whenever doing
something fishy that would confuse the page cache, we
1. lock
2. invalidate page cache for all files belonging to that cow entity
3. copyfile(), write(), or whatever
4. unlock
This is always possibly, because page cache for cow-files is never
read-write. If it was, we would have done 1-4 before and now have a
regular (non-cow) file.
Did I miss something?
I know a writable mmap needs to trigger a copy in that case.
And then are fun cases with MAP_FIXED which may mean invalidation
is not allowed.
How is "invalidation not allowed" for MAP_FIXED? Application will
never see the fault...

Pavel
--
Horseback riding is like software...
...vgf orggre jura vgf serr.
Jamie Lokier
2004-04-05 12:17:53 UTC
Permalink
Post by Pavel Machek
Post by Eric W. Biederman
I know a writable mmap needs to trigger a copy in that case.
And then are fun cases with MAP_FIXED which may mean invalidation
is not allowed.
How is "invalidation not allowed" for MAP_FIXED? Application will
never see the fault...
I think Eric secretly meant MAP_LOCKED and/or mlock().

Even if the file isn't copied, when you have two different mappings of
different cowlinks both mapped with MAP_LOCKED, the pages need to be
different in RAM or we break locked page expectations.

-- Jamie
Jamie Lokier
2004-04-05 12:39:33 UTC
Permalink
As scheme that does not isolate the invalidate to the new copy worries me.
It is possible to isolate the invalidate to mappings of the newly
broken cowlink only.

There is still the problem of MAP_LOCKED, or more realistically
mlockall() used with mappings of glibc etc. on a filesystem snapshot
made using cowlinks. The easiest thing is obviously to break the
cowlink when a mapping is locked, but it's not nice.

-- Jamie
Jamie Lokier
2004-04-05 12:41:58 UTC
Permalink
Post by Jörn Engel
Sharing the page cache is more important to me than sharing disk
space. Disk space is cheap, but increasing memory beyond 1GiB in my
notebook is not and 1GiB is too little, so memory is the real
constraint.
Lucky you! I have 192MB, that's as much as it can take.
Post by Jörn Engel
And it looks like Pavel already found the solution. Whenever doing
something fishy that would confuse the page cache, we
1. lock
2. invalidate page cache for all files belonging to that cow entity
3. copyfile(), write(), or whatever
4. unlock
=20
This is always possibly, because page cache for cow-files is never
read-write. If it was, we would have done 1-4 before and now have a
regular (non-cow) file.
=20
Did I miss something?
Just some interesting indirection or substitution of address_space
objects needed in the vmas, to map the right pages.

-- Jamie
Jörn Engel
2004-04-05 18:03:01 UTC
Permalink
Post by Jamie Lokier
Post by Jörn Engel
Sharing the page cache is more important to me than sharing disk
space. Disk space is cheap, but increasing memory beyond 1GiB in m=
y
Post by Jamie Lokier
Post by Jörn Engel
notebook is not and 1GiB is too little, so memory is the real
constraint.
=20
Lucky you! I have 192MB, that's as much as it can take.
I know your pain, last month my machine had just 160MiB.

That month I also didn't care much about page cache awareness of cow.
A grep over one kernel tree would thrash the cache just as a grep over
ten. Now I can tell the difference again.
Post by Jamie Lokier
Post by Jörn Engel
And it looks like Pavel already found the solution. Whenever doing
something fishy that would confuse the page cache, we
1. lock
2. invalidate page cache for all files belonging to that cow entity
3. copyfile(), write(), or whatever
4. unlock
=20
This is always possibly, because page cache for cow-files is never
read-write. If it was, we would have done 1-4 before and now have =
a
Post by Jamie Lokier
Post by Jörn Engel
regular (non-cow) file.
=20
Did I miss something?
=20
Just some interesting indirection or substitution of address_space
objects needed in the vmas, to map the right pages.
That is already covered by the nonexistent invalidate_inode() function
in point 2. :)

You are right, implementation is more interesting, but it *should*
work.

J=F6rn

--=20
To announce that there must be no criticism of the President, or that w=
e
are to stand by the President, right or wrong, is not only unpatriotic
and servile, but is morally treasonable to the American public.
-- Theodore Roosevelt, Kansas City Star, 1918

j***@unity.ncsu.edu
2004-04-05 11:10:33 UTC
Permalink
Post by Pavel Machek
Post by Jamie Lokier
get_data_id() is one way to detect equivalent files. Another would be
a function files_equal(fd1, fd2) which returns a boolean.
files_equal(...) would lead to quadratic number of calls, no?
Post by Jamie Lokier
get_data_id() has the advantage that it can report immediately whether
a file has _any_ cowlink peers, which is important for programs that
scan trees. Perhaps getxattr() would be reasonable interface, using a
named attribute "data-id".
Yes, get_data_id() is extremely ugly name.
I think it is worth asking if we really want to give userspace a way of
doing this or not. It exposes fairly low level FS details to userspace,
and this will limit our ability to change the implementation of the FS
in the future (partially shared files?). Certainly there has been some
pain caused over the years because userspace can ask for the inode number,
and people have written file systems which do not use inodes. Then they
have to kluge around this and make something up. I would hate to see
us implement an interface that causes long term pain.

I also cant really think of anyone who would need this information. I have
seen diff and tar used as examples. Perhaps diff would run faster but that
seems like a very special case thing, and diff will certainly work w/o it.
Tar might also be faster creating archives if it had this information
available. However to make tar useful wrt cowlinks, it will need to be
able to create these links at extract time from tarfiles which were created
on non-cowlink filesystems, so I don't think there is a pressing need.

Of course, I am willing to believe that I am wrong. Hope you all have a
great day.

Thanks,

Jim
--
www.jeweltran.com
Pavel Machek
2004-04-05 11:46:37 UTC
Permalink
Hi!
Post by j***@unity.ncsu.edu
Post by Pavel Machek
Post by Jamie Lokier
get_data_id() is one way to detect equivalent files. Another would be
a function files_equal(fd1, fd2) which returns a boolean.
files_equal(...) would lead to quadratic number of calls, no?
Post by Jamie Lokier
get_data_id() has the advantage that it can report immediately whether
a file has _any_ cowlink peers, which is important for programs that
scan trees. Perhaps getxattr() would be reasonable interface, using a
named attribute "data-id".
Yes, get_data_id() is extremely ugly name.
I think it is worth asking if we really want to give userspace a way of
doing this or not. It exposes fairly low level FS details to userspace,
and this will limit our ability to change the implementation of the FS
in the future (partially shared files?). Certainly there has been some
pain caused over the years because userspace can ask for the inode number,
and people have written file systems which do not use inodes. Then they
have to kluge around this and make something up. I would hate to see
us implement an interface that causes long term pain.
It should not be painfull, get_data_id() can allways return
-Esomething, meaning "I do not know".
Post by j***@unity.ncsu.edu
I also cant really think of anyone who would need this information. I have
seen diff and tar used as examples. Perhaps diff would run faster but that
seems like a very special case thing, and diff will certainly work w/o it.
Speeding up diff was one of main cowlinks motivations.
Pavel
--
Horseback riding is like software...
...vgf orggre jura vgf serr.
Jamie Lokier
2004-04-05 12:35:56 UTC
Permalink
Post by j***@unity.ncsu.edu
Perhaps diff would run faster but that
seems like a very special case thing, and diff will certainly work w/o it.
We are talking about a difference between 20 minutes and 1 second.
It's quite significant, when it's a regular part of your diffing &
patching day.

I agree with your general sentiment that we shouldn't expose
filesystem details, e.g. a 32-bit integer. See below for an
alternative interface.
Post by j***@unity.ncsu.edu
Tar might also be faster creating archives if it had this information
available. However to make tar useful wrt cowlinks, it will need to be
able to create these links at extract time from tarfiles which were created
on non-cowlink filesystems, so I don't think there is a pressing need.
I agree. The purpose of cowlinks is to be semantically invisible. If tar
or some other archiver/transferer wanted to use this information, it
should really be checking for equivalent files in general (like cmp)
and use this call as an optimisation only.

Btw, when we treat cowlinks as a semantically invisible, there is no
problem searching an entire filesystem for files with identical
content and linking them together to save space, in a cron job. It's
invisible to applications, except that space is saved and sometimes
the first write takes longer.

That still permits the get_data_id() optimisation, but that now
strictly means "kernel knows and returns a unique id of the data
(unique in this filesystem)".

Instead of get_data_id(), we'd use a POSIX attribute called "data-id"
returned by getxattr(). An absence of the attribute indicates that no
data-id is known. Otherwise, it's a unique id for that data in the
current filesystem.

It's a short byte string (another reason for making it a POSIX
attribute). On ext2/ext3, it's just the bytes of the shared inode
number plus a filesystem-wide generation number. On a hypothetical
httpfs, it could be the host name and ETag (a strong validator). On
any filesystem, it could be the SHA1 digest if that is known. It
would have the nice property of working over NFSv4, too.

-- Jamie
Jörn Engel
2004-04-05 08:43:28 UTC
Permalink
=20
Post by Jörn Engel
Then you link()...
=20
INODE123 Usage count =3D 2, pointer to cowid 567
COWID 567: Usage count =3D 3
INODE124 Usage count =3D 2, pointer to cowid 567
INODE125 Usage count =3D 2, pointer to cowid 567
=20
Now, if I write to any inode with has cowid, data have to be copied,
and pointer to cowid deleted from that inode .
Ok, you win. Next time I get scare, I should ask you first. :)

In a single picture, links currently look like this:

Symlink can point to inodes or cowlinks or hardlinks
Hardlink can point to inodes or cowlinks
Cowlink can point to inodes

I like it.

Not sure about the current count, but it looks like most people favor
the indirect approach now.

J=F6rn

--=20
"Security vulnerabilities are here to stay."
-- Scott Culp, Manager of the Microsoft Security Response Center, 2001
Eric W. Biederman
2004-04-03 19:47:58 UTC
Permalink
Post by Pavel Machek
=20
I think they *should* have separate permissions.
=20
That makes the count 2:2. I'll continue to follow the simple solutio=
n
for some time, but wouldn't like to have it included for now (or ever=
?)
=20
Post by Pavel Machek
Also it should be possible to have file with 2 hardlinks cowlinked
somewhere, and possibly make more hardlinks of that one... Having
pointer to another inode in place where direct block pointers norma=
lly
Post by Pavel Machek
are should be enough (thinking ext2 here).
=20
All right, you are proposing hell. I've tried to think through all
possibilities and was too scared to continue. So limitation is that
cowlinks and hardlinks are mutually exclusive, which eliminated all
problems.
Sounds like a simple place to start.
If you really want cowlinks and hardlinks to be intermixed freely, I'=
d
happily agree with you as soon as you can define the behaviour for al=
l
possible cases in a simple document and none of them make me scared
again. Show me that it is possible and makes sense.
Concise summary:
- (indirection inodes) solve the hard link problem.
- Don't share the page cache between cow copies.

=46or intermixing hard links and cow copy operations a single extra lay=
er
of indirection solves the problem. The cow files point to an (indirect=
ion)
inode that holds the hard link count, the real inode number and possibl=
y=20
a few extra things like permissions. Except for never being pointed
at by a directory entry the real inode works like normal. The link
count of the real inode is the number of indirection inodes pointing
at it.

Caching issues are more subtle. The pathological case is a read only
mapping of the cow file, that expects to see the mapping changed by
some other process writing to the file. Not sharing page cache
entries between cow copies solves this problem.

Once page cache entries are distinct it is possible to move the
trigger down from an open with intent to write, to the first
actual write operation. In aops->prepare_write from sys_write,
and aops->writepage from the mmap version.

And a few scenarios to hopefully make things clear.

So your fs starts out as:

/file1 -> ino1 (link count 2) -> data block #1
/file2 -> ino1 (link count 2) -> data block #1

file1 is only 4K long, so I only need to describe one data block.

Actions:

cowcopy(file2, file3):

/file1 -> ino1 (link count 2) -> ino2 (link count 2) -> data block #1
/file2 -> ino1 (link count 2) -> ino2 (link count 2) -> data block #1
/file3 -> ino3 (link count 1) -> ino2 (link count 2) -> data block #1


copyfile(file3, file4):

/file1 -> ino1 (link count 2) -> ino2 (link count 3) -> data block #1
/file2 -> ino1 (link count 2) -> ino2 (link count 3) -> data block #1
/file3 -> ino3 (link count 1) -> ino2 (link count 3) -> data block #1
/file4 -> ino4 (link count 1) -> ino2 (link count 3) -> data block #1

unlink(file2):

/file1 -> ino1 (link count 1) -> ino2 (link count 3) -> data block #1
/file3 -> ino3 (link count 1) -> ino2 (link count 3) -> data block #1
/file4 -> ino4 (link count 1) -> ino2 (link count 3) -> data block #1

link(file4, file5):

/file1 -> ino1 (link count 1) -> ino2 (link count 3) -> data block #1
/file3 -> ino3 (link count 1) -> ino2 (link count 3) -> data block #1
/file4 -> ino4 (link count 2) -> ino2 (link count 3) -> data block #1
/file5 -> ino4 (link count 2) -> ino2 (link count 3) -> data block #1

write(file3):

/file1 -> ino1 (link count 1) -> ino2 (link count 2) -> data block #1
/file3 -> ino3 (link count 1) -> data block #2
/file4 -> ino4 (link count 2) -> ino2 (link count 2) -> data block #1
/file5 -> ino4 (link count 2) -> ino2 (link count 2) -> data block #1

write(file5):

/file1 -> ino1 (link count 1) -> ino2 (link count 1) -> data block #1
/file3 -> ino3 (link count 1) -> data block #2
/file4 -> ino4 (link count 2) -> data block #3
/file5 -> ino4 (link count 2) -> data block #3

write(file1):

/file1 -> ino1 (link count 1) -> data block #1 (with modified contents)
/file3 -> ino3 (link count 1) -> data block #2
/file4 -> ino4 (link count 2) -> data block #3
/file5 -> ino4 (link count 2) -> data block #3


Does this make things clear?

Eric
Jörn Engel
2004-04-05 08:54:21 UTC
Permalink
Post by Eric W. Biederman
=20
And a few scenarios to hopefully make things clear.
=20
=20
/file1 -> ino1 (link count 2) -> data block #1
/file2 -> ino1 (link count 2) -> data block #1
=20
file1 is only 4K long, so I only need to describe one data block.
=20
=20
=20
/file1 -> ino1 (link count 2) -> ino2 (link count 2) -> data block #1
/file2 -> ino1 (link count 2) -> ino2 (link count 2) -> data block #1
/file3 -> ino3 (link count 1) -> ino2 (link count 2) -> data block #1
=20
=20
=20
/file1 -> ino1 (link count 2) -> ino2 (link count 3) -> data block #1
/file2 -> ino1 (link count 2) -> ino2 (link count 3) -> data block #1
/file3 -> ino3 (link count 1) -> ino2 (link count 3) -> data block #1
/file4 -> ino4 (link count 1) -> ino2 (link count 3) -> data block #1
=20
=20
/file1 -> ino1 (link count 1) -> ino2 (link count 3) -> data block #1
/file3 -> ino3 (link count 1) -> ino2 (link count 3) -> data block #1
/file4 -> ino4 (link count 1) -> ino2 (link count 3) -> data block #1
=20
=20
/file1 -> ino1 (link count 1) -> ino2 (link count 3) -> data block #1
/file3 -> ino3 (link count 1) -> ino2 (link count 3) -> data block #1
/file4 -> ino4 (link count 2) -> ino2 (link count 3) -> data block #1
/file5 -> ino4 (link count 2) -> ino2 (link count 3) -> data block #1
=20
=20
/file1 -> ino1 (link count 1) -> ino2 (link count 2) -> data block #1
/file3 -> ino3 (link count 1) -> data block #2
/file4 -> ino4 (link count 2) -> ino2 (link count 2) -> data block #1
/file5 -> ino4 (link count 2) -> ino2 (link count 2) -> data block #1
=20
=20
/file1 -> ino1 (link count 1) -> ino2 (link count 1) -> data block #1
/file3 -> ino3 (link count 1) -> data block #2
/file4 -> ino4 (link count 2) -> data block #3
/file5 -> ino4 (link count 2) -> data block #3
=20
=20
/file1 -> ino1 (link count 1) -> data block #1 (with modified content=
s)
Post by Eric W. Biederman
/file3 -> ino3 (link count 1) -> data block #2
/file4 -> ino4 (link count 2) -> data block #3
/file5 -> ino4 (link count 2) -> data block #3
=20
Does this make things clear?
Almost. What exactly do cowcopy() and copyfile() do? They look the
same, so why the difference?

J=F6rn

--=20
The story so far:
In the beginning the Universe was created. This has made a lot
of people very angry and been widely regarded as a bad move.
-- Douglas Adams?
Eric W. Biederman
2004-04-05 09:07:29 UTC
Permalink
Post by Jörn Engel
Almost. What exactly do cowcopy() and copyfile() do? They look the
same, so why the difference?
A typo on my part from the looks of it. I mean the same operation.

Eric
Davide Libenzi
2004-03-20 16:48:43 UTC
Permalink
Post by Patrick J. LoPresti
What happens if the disk fills while you are making the copy? Will
open(2) on an *existing file* then return ENOSPC?
I do not think you can implement this without changing the interface
to open(2). Which means applications have to be made aware of it
anyway. Which means you might as well leave your implementation as-is
and let userspace worry about creating the copy (and dealing with the
resulting errors).
FWIW I did this quite some time ago to speed up copy+diff linux kernel
trees:

http://www.xmailserver.org/flcow.html

It is entirely userspace and uses LD_PRELOAD on my dev shell.



- Davide
Jörn Engel
2004-03-21 12:57:30 UTC
Permalink
=20
Post by Patrick J. LoPresti
What happens if the disk fills while you are making the copy? Will
open(2) on an *existing file* then return ENOSPC?
=20
I do not think you can implement this without changing the interfac=
e
Post by Patrick J. LoPresti
to open(2). Which means applications have to be made aware of it
anyway. Which means you might as well leave your implementation as=
-is
Post by Patrick J. LoPresti
and let userspace worry about creating the copy (and dealing with t=
he
Post by Patrick J. LoPresti
resulting errors).
=20
FWIW I did this quite some time ago to speed up copy+diff linux kerne=
l=20
=20
http://www.xmailserver.org/flcow.html
=20
It is entirely userspace and uses LD_PRELOAD on my dev shell.
Nice work. I was thinking about something like that as an
intermediate solution (my goal is libc inclusion), just with slightly
different checks:

int ret =3D open(...);
if (ret =3D=3D -EMLINK)
ret =3D cow_open(...);
return ret;

J=F6rn

--=20
He that composes himself is wiser than he that composes a book.
-- B. Franklin
Davide Libenzi
2004-03-21 17:59:39 UTC
Permalink
Post by Jörn Engel
=20
FWIW I did this quite some time ago to speed up copy+diff linux ker=
nel=20
Post by Jörn Engel
=20
http://www.xmailserver.org/flcow.html
=20
It is entirely userspace and uses LD_PRELOAD on my dev shell.
=20
Nice work. I was thinking about something like that as an
intermediate solution (my goal is libc inclusion), just with slightly
=20
int ret =3D open(...);
if (ret =3D=3D -EMLINK)
ret =3D cow_open(...);
return ret;
When I did that, fumes of an in-kernel implementation invaded my head f=
or=20
a little while. Then you start thinking that you have to teach apps of =
new=20
open(2) semantics, you have to bloat kernel code a little bit and you h=
ave=20
to deal with a new set of errors cases that open(2) is not expected to=20
deal with. A fully userspace implementation did fit my needs at that ti=
me,=20
even if the LD_PRELOAD trick might break if weak aliases setup for open=
=20
functions change inside glibc.



- Davide
Jörn Engel
2004-03-21 18:14:30 UTC
Permalink
=20
When I did that, fumes of an in-kernel implementation invaded my head=
for=20
a little while. Then you start thinking that you have to teach apps o=
f new=20
open(2) semantics, you have to bloat kernel code a little bit and you=
have=20
to deal with a new set of errors cases that open(2) is not expected t=
o=20
deal with. A fully userspace implementation did fit my needs at that =
time,=20
even if the LD_PRELOAD trick might break if weak aliases setup for op=
en=20
functions change inside glibc.
209 fairly simple lines definitely have more appear than a full
in-kernel implementation with many new corner-cases, yes. But it
looks as if you ignore the -ENOSPC case, so you cheated a little. ;)

No matter how you try, there is no way around an additional return
code for open(), so we have to break compatibility anyway. The good
news is that a) people not using this feature won't notice and b) all
programs I tried so far can deal with the problem. Vim even has a
decent error message - as if my patch was anticipated already.

J=F6rn

--=20
Everything should be made as simple as possible, but not simpler.
-- Albert Einstein
Davide Libenzi
2004-03-21 20:26:57 UTC
Permalink
Post by Jörn Engel
=20
When I did that, fumes of an in-kernel implementation invaded my he=
ad for=20
Post by Jörn Engel
a little while. Then you start thinking that you have to teach apps=
of new=20
Post by Jörn Engel
open(2) semantics, you have to bloat kernel code a little bit and y=
ou have=20
Post by Jörn Engel
to deal with a new set of errors cases that open(2) is not expected=
to=20
Post by Jörn Engel
deal with. A fully userspace implementation did fit my needs at tha=
t time,=20
Post by Jörn Engel
even if the LD_PRELOAD trick might break if weak aliases setup for =
open=20
Post by Jörn Engel
functions change inside glibc.
=20
209 fairly simple lines definitely have more appear than a full
in-kernel implementation with many new corner-cases, yes. But it
looks as if you ignore the -ENOSPC case, so you cheated a little. ;)
Yes, at that time I preferred to fall back to the caller open(2) if any=
=20
error happened during the COW (a more picky implementation would simply=
=20
bounce an error to the application). Look BTW that there is a differenc=
e=20
between the error handling when you have an in-kernel solution or a=20
completely userspace solution. If you push this inside the kernel you h=
ave=20
at least to define a new open(2) flag and a new set of errors that migh=
t=20
arise when doing the COW. You are basically changing the open(2) API. Y=
ou=20
have also to "teach" apps to use the new flag, since obviously you cann=
ot=20
make COW a default behavior. The fl-cow approach let you define a set o=
f=20
paths where you want COW to happen (in my case typically /usr/src/lk), =
and=20
only apps writing to hard-linked files inside such path gets COWed. The=
=20
open(2) API does not change. OTOH there is the LD_PRELOAD trick that is=
=20
weak alias dependent.



- Davide
Jörn Engel
2004-03-21 20:35:59 UTC
Permalink
=20
Yes, at that time I preferred to fall back to the caller open(2) if a=
ny=20
error happened during the COW (a more picky implementation would simp=
ly=20
bounce an error to the application). Look BTW that there is a differe=
nce=20
between the error handling when you have an in-kernel solution or a=20
completely userspace solution. If you push this inside the kernel you=
have=20
at least to define a new open(2) flag and a new set of errors that mi=
ght=20
arise when doing the COW. You are basically changing the open(2) API.=
You=20
have also to "teach" apps to use the new flag, since obviously you ca=
nnot=20
make COW a default behavior. The fl-cow approach let you define a set=
of=20
paths where you want COW to happen (in my case typically /usr/src/lk)=
, and=20
only apps writing to hard-linked files inside such path gets COWed. T=
he=20
open(2) API does not change. OTOH there is the LD_PRELOAD trick that =
is=20
weak alias dependent.
My "solution" to this paradox (leaving the interface unchanged, even
though cow is impossible without changes) is a new flag to files. The
user has to manually set the flag and is now in charge of anything
that might break. Scared? Don't set the flag.

It is a compromise, just like yours. Imo it is a little nicer, but
there just isn't any good solution anymore, 1967 would have been the
right time for that.

J=F6rn

--=20
To announce that there must be no criticism of the President, or that w=
e
are to stand by the President, right or wrong, is not only unpatriotic
and servile, but is morally treasonable to the American public.
-- Theodore Roosevelt, Kansas City Star, 1918
Eric W. Biederman
2004-03-22 00:18:41 UTC
Permalink
Post by Jörn Engel
=20
When I did that, fumes of an in-kernel implementation invaded my he=
ad for=20
Post by Jörn Engel
a little while. Then you start thinking that you have to teach apps=
of new=20
Post by Jörn Engel
open(2) semantics, you have to bloat kernel code a little bit and y=
ou have=20
Post by Jörn Engel
to deal with a new set of errors cases that open(2) is not expected=
to=20
Post by Jörn Engel
deal with. A fully userspace implementation did fit my needs at tha=
t time,=20
Post by Jörn Engel
even if the LD_PRELOAD trick might break if weak aliases setup for =
open=20
Post by Jörn Engel
functions change inside glibc.
=20
209 fairly simple lines definitely have more appear than a full
in-kernel implementation with many new corner-cases, yes. But it
looks as if you ignore the -ENOSPC case, so you cheated a little. ;)
=20
No matter how you try, there is no way around an additional return
code for open(), so we have to break compatibility anyway. The good
news is that a) people not using this feature won't notice and b) all
programs I tried so far can deal with the problem. Vim even has a
decent error message - as if my patch was anticipated already.
Actually there is... You don't do the copy until an actual write occur=
s.
Some files are opened read/write when there is simply the chance they m=
ight
be written to so delaying the copy is generally a win.

A coworker of mine implemented a version of this idea as a filesystem.
It did the copy in the kernel, it handled directories, and could be
used to atomically snapshot your filesystem. The only case that was
still a little sketchy was how do you handle cow to a file with hard
links.=20

The interesting case for us is when you have multiple machines sharing
the same root filesystem.

Eric
Davide Libenzi
2004-03-22 00:25:31 UTC
Permalink
=20
Post by Jörn Engel
=20
When I did that, fumes of an in-kernel implementation invaded my =
head for=20
Post by Jörn Engel
a little while. Then you start thinking that you have to teach ap=
ps of new=20
Post by Jörn Engel
open(2) semantics, you have to bloat kernel code a little bit and=
you have=20
Post by Jörn Engel
to deal with a new set of errors cases that open(2) is not expect=
ed to=20
Post by Jörn Engel
deal with. A fully userspace implementation did fit my needs at t=
hat time,=20
Post by Jörn Engel
even if the LD_PRELOAD trick might break if weak aliases setup fo=
r open=20
Post by Jörn Engel
functions change inside glibc.
=20
209 fairly simple lines definitely have more appear than a full
in-kernel implementation with many new corner-cases, yes. But it
looks as if you ignore the -ENOSPC case, so you cheated a little. ;=
)
Post by Jörn Engel
=20
No matter how you try, there is no way around an additional return
code for open(), so we have to break compatibility anyway. The goo=
d
Post by Jörn Engel
news is that a) people not using this feature won't notice and b) a=
ll
Post by Jörn Engel
programs I tried so far can deal with the problem. Vim even has a
decent error message - as if my patch was anticipated already.
=20
Actually there is... You don't do the copy until an actual write occ=
urs.
Some files are opened read/write when there is simply the chance they=
might
be written to so delaying the copy is generally a win.
What about open+mmap?
=09


- Davide
Eric W. Biederman
2004-03-22 05:07:40 UTC
Permalink
Post by Davide Libenzi
=20
=20
Post by Jörn Engel
=20
When I did that, fumes of an in-kernel implementation invaded m=
y head for
Post by Davide Libenzi
=20
Post by Jörn Engel
a little while. Then you start thinking that you have to teach =
apps of new
Post by Davide Libenzi
=20
Post by Jörn Engel
open(2) semantics, you have to bloat kernel code a little bit a=
nd you have
Post by Davide Libenzi
=20
Post by Jörn Engel
to deal with a new set of errors cases that open(2) is not expe=
cted to=20
Post by Davide Libenzi
Post by Jörn Engel
deal with. A fully userspace implementation did fit my needs at=
that time,
Post by Davide Libenzi
=20
Post by Jörn Engel
even if the LD_PRELOAD trick might break if weak aliases setup =
for open=20
Post by Davide Libenzi
Post by Jörn Engel
functions change inside glibc.
=20
209 fairly simple lines definitely have more appear than a full
in-kernel implementation with many new corner-cases, yes. But it
looks as if you ignore the -ENOSPC case, so you cheated a little.=
;)
Post by Davide Libenzi
Post by Jörn Engel
=20
No matter how you try, there is no way around an additional retur=
n
Post by Davide Libenzi
Post by Jörn Engel
code for open(), so we have to break compatibility anyway. The g=
ood
Post by Davide Libenzi
Post by Jörn Engel
news is that a) people not using this feature won't notice and b)=
all
Post by Davide Libenzi
Post by Jörn Engel
programs I tried so far can deal with the problem. Vim even has =
a
Post by Davide Libenzi
Post by Jörn Engel
decent error message - as if my patch was anticipated already.
=20
Actually there is... You don't do the copy until an actual write o=
ccurs.
Post by Davide Libenzi
Some files are opened read/write when there is simply the chance th=
ey might
Post by Davide Libenzi
be written to so delaying the copy is generally a win.
=20
What about open+mmap?
The case is nothing really different from having a hole in your file.

There are two pieces to implementing this. First you create separate
page cache entries for the cow file and it's original, so the
laziness of mmapped file writes will not bite you.. Second you make
aops -> writepage trigger the actual copy of the file, and have it
return -ENOSPC if you can't do the copy.

If cow links became sufficiently common you might want to dig into
the VFS and make it possible to do the copy when a write-fault occurs.
At which point you could share the page cache until you do a copy.

Eric
Davide Libenzi
2004-03-22 05:11:38 UTC
Permalink
Post by Eric W. Biederman
Post by Davide Libenzi
Actually there is... You don't do the copy until an actual write occurs.
Some files are opened read/write when there is simply the chance they might
be written to so delaying the copy is generally a win.
What about open+mmap?
The case is nothing really different from having a hole in your file.
There are two pieces to implementing this. First you create separate
page cache entries for the cow file and it's original, so the
laziness of mmapped file writes will not bite you.. Second you make
aops -> writepage trigger the actual copy of the file, and have it
return -ENOSPC if you can't do the copy.
There has been a misunderstanding. I thought you were talking about a
userspace solution ala fl-cow. Of course if you are inside the kernel you
can catch both explicit writes and page syncs.



- Davide
Eric W. Biederman
2004-03-22 11:20:31 UTC
Permalink
Post by Davide Libenzi
Post by Eric W. Biederman
Post by Davide Libenzi
Actually there is... You don't do the copy until an actual write occurs.
Some files are opened read/write when there is simply the chance they
might
Post by Eric W. Biederman
Post by Davide Libenzi
be written to so delaying the copy is generally a win.
What about open+mmap?
The case is nothing really different from having a hole in your file.
There are two pieces to implementing this. First you create separate
page cache entries for the cow file and it's original, so the
laziness of mmapped file writes will not bite you.. Second you make
aops -> writepage trigger the actual copy of the file, and have it
return -ENOSPC if you can't do the copy.
There has been a misunderstanding. I thought you were talking about a
userspace solution ala fl-cow. Of course if you are inside the kernel you
can catch both explicit writes and page syncs.
Right. Although there is nothing prevent the copy to be in user space
even with the trigger hooks down in the write path.

The nice features of having the hook at that point in the kernel are:
1) No new failure modes for user space to worry about.
2) With cow directory support instant fs level check pointing is achieved.

Eric
Davide Libenzi
2004-03-22 16:02:23 UTC
Permalink
Post by Eric W. Biederman
Post by Davide Libenzi
There has been a misunderstanding. I thought you were talking about a
userspace solution ala fl-cow. Of course if you are inside the kernel you
can catch both explicit writes and page syncs.
Right. Although there is nothing prevent the copy to be in user space
even with the trigger hooks down in the write path.
How do you insert yourself before the first page fault to do the COW, from
userspace (open+mmap)? You can obviously hook mmap(2) and if a PROT_WRITE
is requested, you COW from there. But then you have an whole bunch of new
problems (again, when done from userspace) because, just to begin with,
you need a stateful interception layer (while fl-cow for example is stateless).
In my modest usage scenario for my fl-cow shell (emacs+patch+diff+gcc)
I've found that when something opens in RDWR, it really writes the file at
some point during the opened session. So moving the COW down in the write
path helps little or nothing. There may be as well other use cases where
applications do frequently open in RDWR even w/out ever touching the file.



- Davide
Jamie Lokier
2004-03-25 17:49:42 UTC
Permalink
Actually there is... You don't do the copy until an actual write occurs.
Some files are opened read/write when there is simply the chance they might
be written to so delaying the copy is generally a win.
Programs depend on the inode number returned by fstat() not changing,
and maybe in some other circumstances, even if they subsequently write
to the file.

(It's ok for open() to change the inode number, because that's
equivalent to another program changing the filesystem in parallel).

How do you handle that if COW occurs later than open()?
You could also force COW when fstat() is called, I suppose.

-- Jamie
Eric W. Biederman
2004-03-25 18:06:49 UTC
Permalink
Post by Jamie Lokier
Actually there is... You don't do the copy until an actual write occurs.
Some files are opened read/write when there is simply the chance they might
be written to so delaying the copy is generally a win.
Programs depend on the inode number returned by fstat() not changing,
and maybe in some other circumstances, even if they subsequently write
to the file.
(It's ok for open() to change the inode number, because that's
equivalent to another program changing the filesystem in parallel).
How do you handle that if COW occurs later than open()?
You could also force COW when fstat() is called, I suppose.
One of the rougher patches, in that we don't have persistent inode
numbers. Basically the two files never have the same inode number.
To the user they are always presented as two separate files.
Currently I believe the strategy is to assign an inode when the file
is read into the icache/dcache. I think it is just a sequential
counter.

This was all implemented as a stackable filesystem. Something
that gets down to the real filesystem could likely just reuse
the inode number of the cow link.

Eric
Jamie Lokier
2004-03-25 19:43:03 UTC
Permalink
Post by Eric W. Biederman
One of the rougher patches, in that we don't have persistent inode
numbers. Basically the two files never have the same inode number.
To the user they are always presented as two separate files.
That is not useful for me or the other people who want to use this to
duplicate large source trees and run "diff" between trees.

"diff" depends on being able to check if files in the two trees are
identical -- by checking whether the inode number and device (and
maybe other stat data) are identical. This allows "diff -ur" between
two cloned trees the size of linux to be quite fast. Without that
optimisation, it's very slow indeed.

-- Jamie
Linus Torvalds
2004-03-25 20:38:50 UTC
Permalink
Post by Jamie Lokier
That is not useful for me or the other people who want to use this to
duplicate large source trees and run "diff" between trees.
"diff" depends on being able to check if files in the two trees are
identical -- by checking whether the inode number and device (and
maybe other stat data) are identical. This allows "diff -ur" between
two cloned trees the size of linux to be quite fast. Without that
optimisation, it's very slow indeed.
I think the correct thing to do is to just admit that cowlinks aren't
POSIX, and instead see the inode number as a way to see whether the link
has been broken or not. Ie just accept the inode number potentially
changing.

That would make "diff" (adn most other uses) ok with this, and anythign
that isn't, just couldn't be used with cowlinked files.

Linus
Eric W. Biederman
2004-03-25 22:16:48 UTC
Permalink
Post by Linus Torvalds
Post by Jamie Lokier
That is not useful for me or the other people who want to use this to
duplicate large source trees and run "diff" between trees.
"diff" depends on being able to check if files in the two trees are
identical -- by checking whether the inode number and device (and
maybe other stat data) are identical. This allows "diff -ur" between
two cloned trees the size of linux to be quite fast. Without that
optimisation, it's very slow indeed.
I think the correct thing to do is to just admit that cowlinks aren't
POSIX, and instead see the inode number as a way to see whether the link
has been broken or not. Ie just accept the inode number potentially
changing.
That would make "diff" (adn most other uses) ok with this, and anythign
that isn't, just couldn't be used with cowlinked files.
Really?

tar and cp still need to be taught about them. And if they are not taught
they will do the wrong thing and hard link the files removing the
copy on write semantics. Which would do ugly thing when you restore
from your backup.

I don't have a problem with the inode number changing when you write to
a file, because I can't think of much that would care either way.
But having the inode number of an open file change sounds like a very
difficult problem to track.

Maybe aiming cow links at things like a live cd filesystem is too
ambitious, but it sounds like a minimal clean way to handle all of
the dependencies on writeable files that show up.

Eric
Jörn Engel
2004-04-01 14:53:46 UTC
Permalink
Post by Eric W. Biederman
=20
=20
That is not useful for me or the other people who want to use thi=
s to
Post by Eric W. Biederman
duplicate large source trees and run "diff" between trees.
=20
"diff" depends on being able to check if files in the two trees a=
re
Post by Eric W. Biederman
identical -- by checking whether the inode number and device (and
maybe other stat data) are identical. This allows "diff -ur" bet=
ween
Post by Eric W. Biederman
two cloned trees the size of linux to be quite fast. Without tha=
t
Post by Eric W. Biederman
optimisation, it's very slow indeed.
=20
I think the correct thing to do is to just admit that cowlinks aren=
't
Post by Eric W. Biederman
POSIX, and instead see the inode number as a way to see whether the=
link
Post by Eric W. Biederman
has been broken or not. Ie just accept the inode number potentially
changing.
=20
That would make "diff" (adn most other uses) ok with this, and anyt=
hign=20
Post by Eric W. Biederman
that isn't, just couldn't be used with cowlinked files.
=20
Really?
=20
tar and cp still need to be taught about them. And if they are not t=
aught
Post by Eric W. Biederman
they will do the wrong thing and hard link the files removing the
copy on write semantics. Which would do ugly thing when you restore
from your backup.
=20
I don't have a problem with the inode number changing when you write =
to
Post by Eric W. Biederman
a file, because I can't think of much that would care either way.
But having the inode number of an open file change sounds like a very
difficult problem to track.=20
=20
Maybe aiming cow links at things like a live cd filesystem is too
ambitious, but it sounds like a minimal clean way to handle all of
the dependencies on writeable files that show up. =20
Either method could work with some restrictions. Either way, some
userspace tools need to be taught about the changes. Personally I
don't care much either way, but the simple trick is already working on
my notebook and the problems are not too bad (not too good either).

BTW: sendfile() for ext[23] is finished, so the actual copy can be
done inside the kernel now. Patch will follow soon. (-EBUSY)

J=F6rn

--=20
To announce that there must be no criticism of the President, or that w=
e
are to stand by the President, right or wrong, is not only unpatriotic
and servile, but is morally treasonable to the American public.
-- Theodore Roosevelt, Kansas City Star, 1918
Tim Connors
2004-04-02 11:54:09 UTC
Permalink
Post by Eric W. Biederman
Post by Linus Torvalds
Post by Jamie Lokier
That is not useful for me or the other people who want to use this to
duplicate large source trees and run "diff" between trees.
"diff" depends on being able to check if files in the two trees are
identical -- by checking whether the inode number and device (and
maybe other stat data) are identical. This allows "diff -ur" between
two cloned trees the size of linux to be quite fast. Without that
optimisation, it's very slow indeed.
I think the correct thing to do is to just admit that cowlinks aren't
POSIX, and instead see the inode number as a way to see whether the link
has been broken or not. Ie just accept the inode number potentially
changing.
That would make "diff" (adn most other uses) ok with this, and anythign
that isn't, just couldn't be used with cowlinked files.
Really?
tar and cp still need to be taught about them. And if they are not taught
they will do the wrong thing and hard link the files removing the
copy on write semantics. Which would do ugly thing when you restore
from your backup.
I don't have a problem with the inode number changing when you write to
a file, because I can't think of much that would care either way.
But having the inode number of an open file change sounds like a very
difficult problem to track.
If the inode doesn't change upon a write, the system is still POSIX,
so nothing breaks (the problem is, cowlinks would be very useful, and
POSIX is very useful - making them incompatible would be a major
bummer)

If you use sendfile() or cowcp() or similar to implement cowlinks,
then only rsync/tar/cp need be taught about its usage. Someone
mentioned that diff will not be optimal with different inodes, because
diff compared inodes. So add some diffstat() that keeps track of some
magical inode like thingy that stays the same for cowed files. Then
teach diff to use diffstat().

Now, you have a system where if your program doesn't explicitly use
cowcp() and diffstat(), everything still works perfectly (we want to
still be able to use old coreutils on a machine that has a new kernel
applied, right? And vice-versa?). All that happens is that your
program may or may not be sub-optimal - you might have to compare
entire files instead of just inodes. Or you may have to copy entire
files instead of just cowing them. But it all still Just Works.

My AU$0.02.
--
TimC -- http://astronomy.swin.edu.au/staff/tconnors/
I bet the human brain is a kludge.
-- Marvin Minsky
Eric W. Biederman
2004-03-25 21:46:21 UTC
Permalink
Post by Jamie Lokier
Post by Eric W. Biederman
One of the rougher patches, in that we don't have persistent inode
numbers. Basically the two files never have the same inode number.
To the user they are always presented as two separate files.
That is not useful for me or the other people who want to use this to
duplicate large source trees and run "diff" between trees.
"diff" depends on being able to check if files in the two trees are
identical -- by checking whether the inode number and device (and
maybe other stat data) are identical. This allows "diff -ur" between
two cloned trees the size of linux to be quite fast. Without that
optimisation, it's very slow indeed.
In the case where cow is implemented as a stackable filesystem it is
easy to discover the changes by looking at the underlying fs instead
of at the cow view. If the file was not changed the file was not
copied.

The reason for the late copy is some programs open files O_RDRW and
only read the file. If those triggered a copy on write when you
opened the file, diff would still need to go to the work of manually
comparing the files.

The underlying idea of copy on write is that you have separate files
that happen to be storing the same data, in the same space. Anytime
you deviate from that is when you are going to get surprised.

The case I care about is sharing the same root filesystem on different
machines, and that must look like 2 separate filesystems.

It is easy to add something like a cowstat or a readcowlink and teach
the few programs that care (i.e. diff, tar,...) how to use it. So I
would rather concentrate on making cow links look like a separate copy
than early optimizations.

Eric
Jamie Lokier
2004-03-27 10:28:28 UTC
Permalink
Post by Eric W. Biederman
It is easy to add something like a cowstat or a readcowlink and teach
the few programs that care (i.e. diff, tar,...) how to use it. So I
would rather concentrate on making cow links look like a separate copy
than early optimizations.
I agree, having each cowlink look like a separate copy, with separate
inode numbers, is best. That _is_ POSIX compatible -- the sharing is
just a storage optimisation, and programs which only use the POSIX API
won't see the difference.

I have no problem with adding cowstat() to diff, and I'm sure other
people will eventually extend rsync and tar to use it, if it becomes
widely used.

It's not the simplest solution, though. The filesystem changes are
non-trivial. (The simplest solution is just an ext2 attribute which
says you can't write to the file if it has >1 links).

-- Jamie
Eric W. Biederman
2004-03-27 21:00:45 UTC
Permalink
Post by Jamie Lokier
It is easy to add something like a cowstat or a readcowlink and tea=
ch
Post by Jamie Lokier
the few programs that care (i.e. diff, tar,...) how to use it. So =
I
Post by Jamie Lokier
would rather concentrate on making cow links look like a separate c=
opy
Post by Jamie Lokier
than early optimizations.
=20
I agree, having each cowlink look like a separate copy, with separate
inode numbers, is best. That _is_ POSIX compatible -- the sharing is
just a storage optimisation, and programs which only use the POSIX AP=
I
Post by Jamie Lokier
won't see the difference.
=20
I have no problem with adding cowstat() to diff, and I'm sure other
people will eventually extend rsync and tar to use it, if it becomes
widely used.
=20
It's not the simplest solution, though. The filesystem changes are
non-trivial. (The simplest solution is just an ext2 attribute which
says you can't write to the file if it has >1 links).
There are two possible implementations strategies for implementing
cow files. You can either start as J=F6rn did with hardlinks, or you
can start with symlinks.

The set of tradeoffs is interesting. When using hardlinks the only
sane thing to do is to change the inode number when you do a copy.
You are limited to normal files, no directories, no symlinks, and the
original must resided on the same filesystem as the copy. In addition
a copy will always have a link count of one. So on that score I
see a hard time getting POSIX semantics out of the a hard link based
cow.

When you start with symlinks the tradeoffs are different. You only
trigger a copy on write when you go through the copy not when you
write through the original. You get distinct inodes for free. They
can be straight forwardly extended to work on symlinks and other node
types. You can have multiple links at the end of the copy, because
symlinks can be hard linked. The original can be stored on another
filesystem. If you don't change the original you get POSIX semantics.

As for simplicity I think the two approaches are roughly equal. =20

As my memory has it the proto implementation I saw using a stackable
fs and cow symlinks was about a 1000 lines. And it was that large
because it was complete (i.e. it did the copies including copying
directories.)

Eric
Jamie Lokier
2004-03-27 21:42:38 UTC
Permalink
Post by Eric W. Biederman
There are two possible implementations strategies for implementing
cow files. You can either start as J=F6rn did with hardlinks, or you
can start with symlinks.
Symlinks have a _big_ problem: move one, or move it's target, and it
no longer links to the same file. That's nothing like the
transparency we need cowlinks to have.

There's a third implementation strategy. Since we're talking in all
cases about adding a new feature to the underlying filesystem, why not
implement separate inodes pointing to an underlying shared inode which
holds the data. (I think it was mentioned earlier in this thread).

The implementation is very similar to symbolic links, but instead of
having symlink inodes, you have cowlink inodes which point directly to
another inode and count as references to it.

That provides POSIX semantics and has none of the caveats you and I
have mentioned for hard links and symbolic links.

Implementation: Creating a cowlink to a non-cowlinked file creates a
new shared inode by cloning the file's inode, converts the original
inode to a cowlink-pointer inode, and creates a new cowlink-pointer
inode.

This provides 100% semantic equivalence to copying: all operations
including hard and symbolic links on the resulting cowlink files act
as if the cowlink operation was a copy, but faster and using less
space.
Post by Eric W. Biederman
As my memory has it the proto implementation I saw using a stackable
fs and cow symlinks was about a 1000 lines. And it was that large
because it was complete (i.e. it did the copies including copying
directories.)
I can see a stackable fs being useful for live CD distributions, where
tmpfs or disk hold the modifications stacked over the original
filesystem.

But mounting an fs for each version of a source tree and keeping track
of all those mounts: that would be incredibly clumsy to use.

You could implement a stackable fs which is mounted once and provides
cowlink operations within the fs. That still be a bit clumsy, but not
so much as to make it unusable for source tree management.

-- Jamie
Eric W. Biederman
2004-03-27 23:45:53 UTC
Permalink
Post by Jamie Lokier
Post by Eric W. Biederman
There are two possible implementations strategies for implementing
cow files. You can either start as J=F6rn did with hardlinks, or y=
ou
Post by Jamie Lokier
Post by Eric W. Biederman
can start with symlinks.
=20
Symlinks have a _big_ problem: move one, or move it's target, and it
no longer links to the same file. That's nothing like the
transparency we need cowlinks to have.
With absolute paths you can move the symlink. But the other side
is still problematic.
=20
Post by Jamie Lokier
There's a third implementation strategy. Since we're talking in all
cases about adding a new feature to the underlying filesystem, why no=
t
Post by Jamie Lokier
implement separate inodes pointing to an underlying shared inode whic=
h
Post by Jamie Lokier
holds the data. (I think it was mentioned earlier in this thread).
=20
The implementation is very similar to symbolic links, but instead of
having symlink inodes, you have cowlink inodes which point directly t=
o
Post by Jamie Lokier
another inode and count as references to it.
=20
That provides POSIX semantics and has none of the caveats you and I
have mentioned for hard links and symbolic links.
=20
Implementation: Creating a cowlink to a non-cowlinked file creates a
new shared inode by cloning the file's inode, converts the original
inode to a cowlink-pointer inode, and creates a new cowlink-pointer
inode.
And to state the obvious, you must make certain your fs has a lot of
inodes because this dramatically changes the inode to fs block ratio.
Post by Jamie Lokier
This provides 100% semantic equivalence to copying: all operations
including hard and symbolic links on the resulting cowlink files act
as if the cowlink operation was a copy, but faster and using less
space.
I like it. It's not too hard and it yields a very nice result.
syscall wise we would need:
int cowlink(const char *oldpath, const char *newpath);
int cstat(const char *filename, struct stat *buf);

Which of course are non POSIX :)
Post by Jamie Lokier
Post by Eric W. Biederman
As my memory has it the proto implementation I saw using a stackabl=
e
Post by Jamie Lokier
Post by Eric W. Biederman
fs and cow symlinks was about a 1000 lines. And it was that large
because it was complete (i.e. it did the copies including copying
directories.)
=20
I can see a stackable fs being useful for live CD distributions, wher=
e
Post by Jamie Lokier
tmpfs or disk hold the modifications stacked over the original
filesystem.
The practical use was getting root on a network filesystem to scale,
and be manageable. But the problem is essentially the same. Although
to guard against network congestion interacting badly with paging we
were actually thinking about implementing copy on read. =20
=20
Post by Jamie Lokier
But mounting an fs for each version of a source tree and keeping trac=
k
Post by Jamie Lokier
of all those mounts: that would be incredibly clumsy to use.
=20
You could implement a stackable fs which is mounted once and provides
cowlink operations within the fs. That still be a bit clumsy, but no=
t
Post by Jamie Lokier
so much as to make it unusable for source tree management.
And that is what the prototype we were playing with was doing. You
could not see the original files at all unless you mounted the base
filesystem somewhere outside the cowfs mount. It was actually quite
close to your special cow inode idea except that it stored data in
symlinks instead of inodes. To the extent of freeing files if all of
the cow links to them were removed, and the files were store on the
same filesystem as the cow links.=20

The addictive thing about the prototype implementation was that you
could do ``ln --cow / /some/other/directory'' and you would have an
atomic snapshot of your filesystem. Definitely not a feature for the
first implementation but certainly something to dream about.

Eric
Eric W. Biederman
2004-03-28 00:43:11 UTC
Permalink
Post by Eric W. Biederman
The addictive thing about the prototype implementation was that you
could do ``ln --cow / /some/other/directory'' and you would have an
atomic snapshot of your filesystem. Definitely not a feature for the
first implementation but certainly something to dream about.
Addictive but broken by design. If any of the files inside your
directory tree have hard links outside of the tree there is no way
short of recursing through all of the subdirectories directories to
tell if a given inode has is in use. Except in the special case
where you are taking a cow copy of the entire filesystem. At which
point a magic mount option is likely a better interface.

Ok that simplifies the long term design a little more. cow
directories cannot work correctly in all cases even when implemented
in the kernel. So the directory walking must still be done in user
space.


Eric
Jamie Lokier
2004-03-28 12:22:42 UTC
Permalink
Post by Eric W. Biederman
Post by Eric W. Biederman
The addictive thing about the prototype implementation was that you
could do ``ln --cow / /some/other/directory'' and you would have an
atomic snapshot of your filesystem. Definitely not a feature for the
first implementation but certainly something to dream about.
Addictive but broken by design. If any of the files inside your
directory tree have hard links outside of the tree there is no way
short of recursing through all of the subdirectories directories to
tell if a given inode has is in use. Except in the special case
where you are taking a cow copy of the entire filesystem. At which
point a magic mount option is likely a better interface.
I don't understand this explanation. Can you explain again? What is
the problem with inodes being in use?

-- Jamie
Eric W. Biederman
2004-03-28 20:07:20 UTC
Permalink
Post by Jamie Lokier
Post by Eric W. Biederman
Post by Eric W. Biederman
The addictive thing about the prototype implementation was that you
could do ``ln --cow / /some/other/directory'' and you would have an
atomic snapshot of your filesystem. Definitely not a feature for the
first implementation but certainly something to dream about.
Addictive but broken by design. If any of the files inside your
directory tree have hard links outside of the tree there is no way
short of recursing through all of the subdirectories directories to
tell if a given inode has is in use. Except in the special case
where you are taking a cow copy of the entire filesystem. At which
point a magic mount option is likely a better interface.
I don't understand this explanation. Can you explain again? What is
the problem with inodes being in use?
A COW on a directory implies a COW on everything in it. So the implication
is an atomic snapshot of that directory and everything below it.

Assuming no files are open and in use. The implementation would
create the cow link on the directory just like it would on a file.
Then when you open/modify a directory or file the cow copies would be
taken up to the point of the original cow directory so you have a
separate directory structure.

All of which works great until you have a file that has one hard link
in your cow directory structure and another hard link outside of
any cow. An application can come in and modify the file through that
second cow link causing problems.

So the problem is not really with open files at the time of the cow
although there is a variation of it there as well. At the time of the
cow it is possible to look through the dcache and find all of the
files that are in the cow directory or a subdirectory of the it. Then
you can make those cow files. But again you run into the problem of
an application using a file through a link that is not a subdirectory
of the cow directory.

So in the presence of hard links doing cow on a directory other than
the root directory can not be implemented correctly short of doing a
complete recursive directory copy at the time you create the cow. At
which point you might as well just copy the directories in user space.
At least the race conditions are easily apparent.

Eric
Jamie Lokier
2004-03-28 23:55:28 UTC
Permalink
Post by Eric W. Biederman
A COW on a directory implies a COW on everything in it. So the implication
is an atomic snapshot of that directory and everything below it.
Assuming no files are open and in use. The implementation would
create the cow link on the directory just like it would on a file.
Then when you open/modify a directory or file the cow copies would be
taken up to the point of the original cow directory so you have a
separate directory structure.
All of which works great until you have a file that has one hard link
in your cow directory structure and another hard link outside of
any cow. An application can come in and modify the file through that
second cow link causing problems.
I don't see that problem (although I see another, see below). The
application will modify only one instance of the file, and it's the
correct instance. If the application writes through the link outside
both trees, or the link inside the original tree, it will only affect
the tree that was cowlinked _from_, which is correct. If the
application writes to the name inside the snapshot tree, it will only
affect that tree, which is also correct.

You cowlinked a directory. That converts the original directory inode
to a cowlink, creates another cowlink, and creates a shared inode
which now contains the directory.

Then you modify the directory or anything below it. That duplicates
the directory, breaking the directory cowlinks and duplicating the
shared directory inode -- so that the two directory cowlink inodes
become normal directory inodes. The directory duplication results in
two directory which are full of cowlinks -- every object in the
original directory is cowlinked by this operation.

A file which was originally hard-linked inside the tree and also
outside both trees retains the correct hard-link identity: the hard
link is simply two directory entries referring to the same inode,
which at all times is the inode visible inside the original tree and
not visible inside the snapshot, cowlinked tree. That inode changes
its underlying representation from file-inode to cowlink-inode
(pointing to a shared file-inode) and back again during these
operations. However, the hard link identity remains correct at all
times. Writing to a file won't ever modify the wrong file.

There is a different problem, though: cowlinking whole trees like that
doesn't preserve hard-linkage _within_ the tree being copied. Each
time a directory is lazily duplicated, every entry in it is cowlinked,
and if two or more entries are hard linked to the same inode, the
cowlinked copies won't share an inode. It's as if you did "cp -p"
instead of "cp -pd". This can be solved easily within a single
directory, but when there are hard linked within the tree from
different directories, it's too hard to solve with lazy directory
duplication, without keeping a lot of extra metadata. (That's the
same metadata that "cp --preserve=links" or "rsync -H" has to keep
track of when copying a whole tree: on my home system, there's so much
of it due to hard links that rsync can't copy my home directory).

So I don't see the problem you describe, where an application could
accidentally modify the wrong file. (Perhaps my imagined mechanism
for cowlinking directories is different from yours). In particular, I
see no problem at all with hard links outside the cowlinked trees:
they would be fine.

But I see a different problem: the equivalent of something
semantically equivalent to "cp -pr" is fine and fast, but "cp -dpr"
(aka. "cp -a") must, unless it's quite complicated with filesystem
metadata, duplicate the whole directories immediately rather than
lazily, or at least scan them looking for hard links.

-- Jamie
Eric W. Biederman
2004-03-29 01:31:34 UTC
Permalink
Post by Jamie Lokier
Post by Eric W. Biederman
All of which works great until you have a file that has one hard link
in your cow directory structure and another hard link outside of
any cow. An application can come in and modify the file through that
second cow link causing problems.
I don't see that problem (although I see another, see below). The
application will modify only one instance of the file, and it's the
correct instance. If the application writes through the link outside
both trees, or the link inside the original tree, it will only affect
the tree that was cowlinked _from_, which is correct. If the
application writes to the name inside the snapshot tree, it will only
affect that tree, which is also correct.
What I see is a race. An application may write through the link outside
both trees before any of the links is marked cow. With the result
that you don't have a snapshot of your data.
Post by Jamie Lokier
You cowlinked a directory. That converts the original directory inode
to a cowlink, creates another cowlink, and creates a shared inode
which now contains the directory.
Then you modify the directory or anything below it. That duplicates
the directory, breaking the directory cowlinks and duplicating the
shared directory inode -- so that the two directory cowlink inodes
become normal directory inodes. The directory duplication results in
two directory which are full of cowlinks -- every object in the
original directory is cowlinked by this operation.
A file which was originally hard-linked inside the tree and also
outside both trees retains the correct hard-link identity: the hard
link is simply two directory entries referring to the same inode,
which at all times is the inode visible inside the original tree and
not visible inside the snapshot, cowlinked tree. That inode changes
its underlying representation from file-inode to cowlink-inode
(pointing to a shared file-inode) and back again during these
operations. However, the hard link identity remains correct at all
times. Writing to a file won't ever modify the wrong file.
Correct to a point. And we seem to be imagining the same operations.
However while you will always modify the correct file, as the metadata
is correct. There is no guarantee that the data will be correct. The
file will become a cow file only after it is modified or it's
containing directory is modified. Thus you can have data in the file
that was written after the snapshot operation finished, but before the
individual file itself is marked cow.
Post by Jamie Lokier
There is a different problem, though: cowlinking whole trees like that
doesn't preserve hard-linkage _within_ the tree being copied.
I see a different problem: the equivalent of something
semantically equivalent to "cp -pr" is fine and fast, but "cp -dpr"
(aka. "cp -a") must, unless it's quite complicated with filesystem
metadata, duplicate the whole directories immediately rather than
lazily, or at least scan them looking for hard links.
Now that you bring it out I see that problem as well. I have seen it
in other proposed implementations as well. Keeping hard links linked
requires for some amount of context to be maintained for the entire
copy operation. If necessary you could keep that context where it is
available to the lazy copy but it is far from trivial.

In short lazy copying for creating snapshots is dangerous. The
data you are copying may be modified before you are done. It is
difficult to maintain state across the entire copy.

All of which sounds like a job for user space to me.

Eric
Jamie Lokier
2004-03-29 12:36:58 UTC
Permalink
The file will become a cow file only after it is modified or it's
containing directory is modified.
Eh? The file (or directory) must be labelled as a cowlinked file the
moment you make the cowlink, not when the data is modified. It's
_breaking_ the cowlink that happens when the data (or directory
contents) are modified.
Thus you can have data in the
file that was written after the snapshot operation finished, but
before the individual file itself is marked cow.
The creation of a cowlink should be atomic w.r.t. writing.

Specifically, the operation which moves the contents of a non-cowlink
inode to a newly created shared inode, and converts the original
non-cowlink inode to a cowlink inode, should be atomic.

Is there an unavoidable race condition? I don't see one.

-- Jamie
Eric W. Biederman
2004-03-29 19:36:39 UTC
Permalink
Post by Jamie Lokier
The file will become a cow file only after it is modified or it's
containing directory is modified.
Eh? The file (or directory) must be labelled as a cowlinked file the
moment you make the cowlink, not when the data is modified. It's
_breaking_ the cowlink that happens when the data (or directory
contents) are modified.
You cowlinked a directory. That converts the original directory inode
to a cowlink, creates another cowlink, and creates a shared inode
which now contains the directory.
Then you modify the directory or anything below it. That duplicates
the directory, breaking the directory cowlinks and duplicating the
shared directory inode -- so that the two directory cowlink inodes
become normal directory inodes. The directory duplication results in
two directory which are full of cowlinks -- every object in the
original directory is cowlinked by this operation.
This mechanism does not label all files below a directory as cowlinked
the moment the cowlink.

Therefore cowlink on a directory as described may be an interesting
operation but it is not an atomic snapshot of the directory and
it's contents.
Post by Jamie Lokier
Thus you can have data in the
file that was written after the snapshot operation finished, but
before the individual file itself is marked cow.
The creation of a cowlink should be atomic w.r.t. writing.
Specifically, the operation which moves the contents of a non-cowlink
inode to a newly created shared inode, and converts the original
non-cowlink inode to a cowlink inode, should be atomic.
Is there an unavoidable race condition? I don't see one.
Only if the following apply.

- A correct cow on a directory is considered to be an atomic snapshot
of both the directory and everything below it.
- You break the cow on the directories on demand, as above.

Scenario. The directory tree looks like:

/dir1 (inode 1)/link1 (inode 10)
/dir2 (inode 2)/link2 (inode 10)

Then a cow link is performed:

ln --cow dir2 cow1

Resulting in a directory tree that looks like:
/dir1 (inode 1)/link1 (inode 10)
/dir2 (inode 2) -> (inode 3)/link2 (inode 10)
/cow1 (inode 4) -> (inode 3)/link2 (inode 10)


Then we have several things that could happen.
Scenario A: fd = open(/cow1/link2); write(fd, ....);
Scenario B: fd = open(/dir2/link2); write(fd, ....);
Scenario C: fd = open(/dir1/link1); write(fd, ....);

**
In Scenario A the open breaks the cow on the directory, so that
the cow copy can have it's own inode as:

/dir1 (inode 1)/link1 (inode 10) -> (inode 11)
/dir2 (inode 2)/link2 (inode 10) -> (inode 11)
/cow1 (inode 3)/link2 (inode 12) -> (inode 11)

Then the write occurs the cow is broken and the tree looks like:

/dir1 (inode 1)/link1 (inode 10)
/dir2 (inode 2)/link2 (inode 10) -> (inode 11)
/cow1 (inode 3)/link2 (inode 12)

**
In Scenario B the open breaks the cow on the directory, so that
the cow copy can potentially have it's own inode as:

/dir1 (inode 1)/link1 (inode 10) -> (inode 11)
/dir2 (inode 2)/link2 (inode 10) -> (inode 11)
/cow1 (inode 3)/link2 (inode 12) -> (inode 11)

Then the write occurs the cow is broken and the tree looks like:

/dir1 (inode 1)/link1 (inode 10)
/dir2 (inode 2)/link2 (inode 10)
/cow1 (inode 3)/link2 (inode 12) -> (inode 11)


**
In Scenario C their is no open for the cow to break and
things proceed normally the directory tree remains looking like:

/dir1 (inode 1)/link1 (inode 10)
/dir2 (inode 2) -> (inode 3)/link2 (inode 10)
/cow1 (inode 3) -> (inode 3)/link2 (inode 10)

Then when the write occurs there is not cow to break, and the
tree remains looking like:

/dir1 (inode 1)/link1 (inode 10)
/dir2 (inode 2) -> (inode 3)/link2 (inode 10)
/cow1 (inode 3) -> (inode 3)/link2 (inode 10)

So I see a problem with Scenario C. Perhaps you can refute it.

Eric
Jamie Lokier
2004-03-29 23:05:37 UTC
Permalink
Post by Eric W. Biederman
So I see a problem with Scenario C. Perhaps you can refute it.
Ick. You're right. I cannot refute it.

Fwiw, I would have broken the directory cows on the first write, not
the open.

Thus, snapshots using lazy directory copies requires either that there
are no hard links of the type you described (e.g. when snapshotting
the root of the filesystem), or rather complex metadata to track the
hard links, not dissimilar to the metadata needed to preserve hard
links _within_ the snapshot. They both seem far too complex to be worth it.

Hard links just don't play well with lazy cowlinked directories.

They are fine with non-lazy directory cowlinking, where the whole
directory tree is duplicated and only files are cow'd. Note that this
doesn't apply to the original implementation which used hard links
with a special flag: maintaining hard links in conjunction with
cowlinks requires the inode duplication we've been talking about.

Btw, if we have cowlinks implemented using inode duplication, then it
isn't necessary for both inodes to have the same metadata such as
mtime, ctime, mode etc. Only the data is shared. That means the
sendfile() system call could conceivably be overloaded, meaning to
copy the data, and let "cp --cow" decide whether it wants to copy the
metadata (same as as "cp -rp" or "cp -rpd"), or not (same as "cp -r").

-- Jamie
Eric W. Biederman
2004-03-29 23:58:33 UTC
Permalink
Post by Jamie Lokier
Post by Eric W. Biederman
So I see a problem with Scenario C. Perhaps you can refute it.
Ick. You're right. I cannot refute it.
The upside is since we can't it makes the implementation much easier :)
Post by Jamie Lokier
Fwiw, I would have broken the directory cows on the first write, not
the open.
I did it only so that programs do not see inode numbers change under
them. For the copy that gets the original inode numbers delaying
until write is fine, for the copy with the new inode numbers to avoid
surprises you need to break the cow on the directory and move it to
the files on readdir, stat, and open.
Post by Jamie Lokier
Thus, snapshots using lazy directory copies requires either that there
are no hard links of the type you described (e.g. when snapshotting
the root of the filesystem), or rather complex metadata to track the
hard links, not dissimilar to the metadata needed to preserve hard
links _within_ the snapshot. They both seem far too complex to be worth it.
Agreed.
Post by Jamie Lokier
Hard links just don't play well with lazy cowlinked directories.
They are fine with non-lazy directory cowlinking, where the whole
directory tree is duplicated and only files are cow'd. Note that this
doesn't apply to the original implementation which used hard links
with a special flag: maintaining hard links in conjunction with
cowlinks requires the inode duplication we've been talking about.
? The problem is lazy propagation of the cow flag. The implementation
for ordinary files does not matter. Only the implementation
for directories matters.
Post by Jamie Lokier
Btw, if we have cowlinks implemented using inode duplication, then it
isn't necessary for both inodes to have the same metadata such as
mtime, ctime, mode etc. Only the data is shared. That means the
sendfile() system call could conceivably be overloaded, meaning to
copy the data, and let "cp --cow" decide whether it wants to copy the
metadata (same as as "cp -rp" or "cp -rpd"), or not (same as "cp -r").
Sendfile feels about right but it has a few issues that complication
something like this. It works on file descriptors, and it takes
a length parameter.

There is a lot of room for things to go wrong when implementing
cowlink(const char *oldname, const char *newname) in user space.

Since the semantics are a delayed sendfile in other ways sendfile
feels like a good fit.

Eric
Denis Vlasenko
2004-03-29 07:45:28 UTC
Permalink
Post by Jamie Lokier
Post by Eric W. Biederman
A COW on a directory implies a COW on everything in it. So the
implication is an atomic snapshot of that directory and everything below
it.
Assuming no files are open and in use. The implementation would
create the cow link on the directory just like it would on a file.
Then when you open/modify a directory or file the cow copies would be
taken up to the point of the original cow directory so you have a
separate directory structure.
All of which works great until you have a file that has one hard link
in your cow directory structure and another hard link outside of
any cow. An application can come in and modify the file through that
second cow link causing problems.
I don't see that problem (although I see another, see below). The
application will modify only one instance of the file, and it's the
correct instance. If the application writes through the link outside
both trees, or the link inside the original tree, it will only affect
the tree that was cowlinked _from_, which is correct. If the
application writes to the name inside the snapshot tree, it will only
affect that tree, which is also correct.
You cowlinked a directory. That converts the original directory inode
to a cowlink, creates another cowlink, and creates a shared inode
which now contains the directory.
Then you modify the directory or anything below it. That duplicates
the directory, breaking the directory cowlinks and duplicating the
shared directory inode -- so that the two directory cowlink inodes
become normal directory inodes. The directory duplication results in
two directory which are full of cowlinks -- every object in the
original directory is cowlinked by this operation.
A file which was originally hard-linked inside the tree and also
outside both trees retains the correct hard-link identity: the hard
link is simply two directory entries referring to the same inode,
which at all times is the inode visible inside the original tree and
not visible inside the snapshot, cowlinked tree. That inode changes
its underlying representation from file-inode to cowlink-inode
(pointing to a shared file-inode) and back again during these
operations. However, the hard link identity remains correct at all
times. Writing to a file won't ever modify the wrong file.
There is a different problem, though: cowlinking whole trees like that
doesn't preserve hard-linkage _within_ the tree being copied. Each
time a directory is lazily duplicated, every entry in it is cowlinked,
and if two or more entries are hard linked to the same inode, the
cowlinked copies won't share an inode. It's as if you did "cp -p"
instead of "cp -pd". This can be solved easily within a single
directory, but when there are hard linked within the tree from
different directories, it's too hard to solve with lazy directory
duplication, without keeping a lot of extra metadata. (That's the
same metadata that "cp --preserve=links" or "rsync -H" has to keep
track of when copying a whole tree: on my home system, there's so much
of it due to hard links that rsync can't copy my home directory).
Personally I'll dump hardlinks functionality entirely if I'll get
working cowlinks instead. Most of the time, hardlinks used to save space
and/or provide alternate names to programs/whatever,
but then you have to be damn careful to remember what is linked where.

Softlinks can be used insteal, and actually they are better for
alternate naming purpose, with them I easily see that it is
a link and where it points.

But cowlinks are *ultimate* space saving stuff because they never
make you fail and update wrong file, as it happens with hardlinks.
It just all works the Right Way.
--
vda
Jamie Lokier
2004-03-29 12:40:15 UTC
Permalink
Actually, there's 4th strategy, too. You could implement sharing at block level.
Block free bitmap would become bigger, but you could do some tricks to keep it
at ~8 bits per block...
For sharing source trees and such, and even for COW overlays of
/usr/lib and /usr/bin, that would have no benefit: you never write to
just part of a source or object file.

For databases (including things like the RPM database), and snapshots
of filesystems, it would be more useful.

-- Jamie
Pavel Machek
2004-03-29 09:28:36 UTC
Permalink
Hi!
Post by Jamie Lokier
Post by Eric W. Biederman
There are two possible implementations strategies for implementing
cow files. You can either start as Jrn did with hardlinks, or you
can start with symlinks.
...
Post by Jamie Lokier
There's a third implementation strategy. Since we're talking in all
cases about adding a new feature to the underlying filesystem, why not
implement separate inodes pointing to an underlying shared inode which
holds the data. (I think it was mentioned earlier in this thread).
Actually, there's 4th strategy, too. You could implement sharing at block level.
Block free bitmap would become bigger, but you could do some tricks to keep it
at ~8 bits per block...
--
64 bytes from 195.113.31.123: icmp_seq=28 ttl=51 time=448769.1 ms
Continue reading on narkive:
Loading...