Contents

  1. The Extended File System Version 2 (ext2)
  2. The Extended File System Version 4 (ext4)

The Extended File System Version 2 (ext2)

Back in myyyyyyyyy day (cue old man here), my first Linux distribution used the ext2 file system. This file system is a fairly generic, indexed file system. It does not support journaling and is not generally suitable for solid state drives. It has a superblock that describes the file system, but it contains much more information than the Minix file system's superblock.

Superblock

/*
 * On-disk superblock describing the whole file system.
 *
 * The hex markers in the left margin are the byte offset of the field that
 * follows them.  The last field, s_checksum, sits at 0x3fc, so the whole
 * structure is 0x400 (1024) bytes long.
 */
struct ext2_super_block {
/*000*/	__u32	s_inodes_count;		/* Inodes count */
	__u32	s_blocks_count;		/* Blocks count */
	__u32	s_r_blocks_count;	/* Reserved blocks count */
	__u32	s_free_blocks_count;	/* Free blocks count */
/*010*/	__u32	s_free_inodes_count;	/* Free inodes count */
	__u32	s_first_data_block;	/* First Data Block */
	__u32	s_log_block_size;	/* Block size is 1024 << s_log_block_size */
	__u32	s_log_cluster_size;	/* Allocation cluster size */
/*020*/	__u32	s_blocks_per_group;	/* # Blocks per group */
	__u32	s_clusters_per_group;	/* # Fragments per group */
	__u32	s_inodes_per_group;	/* # Inodes per group */
	__u32	s_mtime;		/* Mount time */
/*030*/	__u32	s_wtime;		/* Write time */
	__u16	s_mnt_count;		/* Mount count */
	__s16	s_max_mnt_count;	/* Maximal mount count */
	__u16	s_magic;		/* Magic signature */
	__u16	s_state;		/* File system state */
	__u16	s_errors;		/* Behaviour when detecting errors */
	__u16	s_minor_rev_level;	/* minor revision level */
/*040*/	__u32	s_lastcheck;		/* time of last check */
	__u32	s_checkinterval;	/* max. time between checks */
	__u32	s_creator_os;		/* OS */
	__u32	s_rev_level;		/* Revision level */
/*050*/	__u16	s_def_resuid;		/* Default uid for reserved blocks */
	__u16	s_def_resgid;		/* Default gid for reserved blocks */
	/*
	 * These fields are for EXT2_DYNAMIC_REV superblocks only.
	 *
	 * Note: the difference between the compatible feature set and
	 * the incompatible feature set is that if there is a bit set
	 * in the incompatible feature set that the kernel doesn't
	 * know about, it should refuse to mount the filesystem.
	 *
	 * e2fsck's requirements are more strict; if it doesn't know
	 * about a feature in either the compatible or incompatible
	 * feature set, it must abort and not try to meddle with
	 * things it doesn't understand...
	 */
	__u32	s_first_ino;		/* First non-reserved inode */
	__u16   s_inode_size;		/* size of inode structure */
	__u16	s_block_group_nr;	/* block group # of this superblock */
	__u32	s_feature_compat;	/* compatible feature set */
/*060*/	__u32	s_feature_incompat;	/* incompatible feature set */
	__u32	s_feature_ro_compat;	/* readonly-compatible feature set */
/*068*/	__u8	s_uuid[16] __nonstring;		/* 128-bit uuid for volume */
/*078*/	__u8	s_volume_name[EXT2_LABEL_LEN] __nonstring;	/* volume name, no NUL? */
/*088*/	__u8	s_last_mounted[64] __nonstring;	/* directory last mounted on, no NUL? */
/*0c8*/	__u32	s_algorithm_usage_bitmap; /* For compression */
	/*
	 * Performance hints.  Directory preallocation should only
	 * happen if the EXT2_FEATURE_COMPAT_DIR_PREALLOC flag is on.
	 */
	__u8	s_prealloc_blocks;	/* Nr of blocks to try to preallocate*/
	__u8	s_prealloc_dir_blocks;	/* Nr to preallocate for dirs */
	__u16	s_reserved_gdt_blocks;	/* Per group table for online growth */
	/*
	 * Journaling support valid if EXT2_FEATURE_COMPAT_HAS_JOURNAL set.
	 */
/*0d0*/	__u8	s_journal_uuid[16] __nonstring;	/* uuid of journal superblock */
/*0e0*/	__u32	s_journal_inum;		/* inode number of journal file */
	__u32	s_journal_dev;		/* device number of journal file */
	__u32	s_last_orphan;		/* start of list of inodes to delete */
/*0ec*/	__u32	s_hash_seed[4];		/* HTREE hash seed */
/*0fc*/	__u8	s_def_hash_version;	/* Default hash version to use */
	__u8	s_jnl_backup_type;	/* Default type of journal backup */
	__u16	s_desc_size;		/* Group desc. size: INCOMPAT_64BIT */
/*100*/	__u32	s_default_mount_opts;	/* default EXT2_MOUNT_* flags used */
	__u32	s_first_meta_bg;	/* First metablock group */
	__u32	s_mkfs_time;		/* When the filesystem was created */
/*10c*/	__u32	s_jnl_blocks[17];	/* Backup of the journal inode */
/*150*/	__u32	s_blocks_count_hi;	/* Blocks count high 32bits */
	__u32	s_r_blocks_count_hi;	/* Reserved blocks count high 32 bits*/
	__u32	s_free_blocks_hi;	/* Free blocks count high 32 bits */
	__u16	s_min_extra_isize;	/* All inodes have at least # bytes */
	__u16	s_want_extra_isize;	/* New inodes should reserve # bytes */
/*160*/	__u32	s_flags;		/* Miscellaneous flags */
	__u16	s_raid_stride;		/* RAID stride in blocks */
	__u16	s_mmp_update_interval;  /* # seconds to wait in MMP checking */
	__u64	s_mmp_block;		/* Block for multi-mount protection */
/*170*/	__u32	s_raid_stripe_width;	/* blocks on all data disks (N*stride)*/
	__u8	s_log_groups_per_flex;	/* FLEX_BG group size */
	__u8	s_checksum_type;	/* metadata checksum algorithm */
	__u8	s_encryption_level;	/* versioning level for encryption */
	__u8	s_reserved_pad;		/* Padding to next 32bits */
	__u64	s_kbytes_written;	/* nr of lifetime kilobytes written */
/*180*/	__u32	s_snapshot_inum;	/* Inode number of active snapshot */
	__u32	s_snapshot_id;		/* sequential ID of active snapshot */
	__u64	s_snapshot_r_blocks_count; /* active snapshot reserved blocks */
/*190*/	__u32	s_snapshot_list;	/* inode number of disk snapshot list */

	__u32	s_error_count;		/* number of fs errors */
	__u32	s_first_error_time;	/* first time an error happened */
	__u32	s_first_error_ino;	/* inode involved in first error */
/*1a0*/	__u64	s_first_error_block;	/* block involved in first error */
	__u8	s_first_error_func[32] __nonstring;	/* function where error hit, no NUL? */
/*1c8*/	__u32	s_first_error_line;	/* line number where error happened */
	__u32	s_last_error_time;	/* most recent time of an error */
/*1d0*/	__u32	s_last_error_ino;	/* inode involved in last error */
	__u32	s_last_error_line;	/* line number where error happened */
	__u64	s_last_error_block;	/* block involved in last error */
/*1e0*/	__u8	s_last_error_func[32] __nonstring;	/* function where error hit, no NUL? */

/*200*/	__u8	s_mount_opts[64] __nonstring;	/* default mount options, no NUL? */
/*240*/	__u32	s_usr_quota_inum;	/* inode number of user quota file */
	__u32	s_grp_quota_inum;	/* inode number of group quota file */
	__u32	s_overhead_clusters;	/* overhead blocks/clusters in fs */
/*24c*/	__u32	s_backup_bgs[2];	/* If sparse_super2 enabled */
/*254*/	__u8	s_encrypt_algos[4];	/* Encryption algorithms in use  */
/*258*/	__u8	s_encrypt_pw_salt[16];	/* Salt used for string2key algorithm */
/*268*/	__le32	s_lpf_ino;		/* Location of the lost+found inode */
	__le32  s_prj_quota_inum;	/* inode for tracking project quota */
/*270*/	__le32	s_checksum_seed;	/* crc32c(orig_uuid) if csum_seed set */
/*274*/	__u8	s_wtime_hi;
	__u8	s_mtime_hi;
	__u8	s_mkfs_time_hi;
	__u8	s_lastcheck_hi;
	__u8	s_first_error_time_hi;
	__u8	s_last_error_time_hi;
	__u8	s_first_error_errcode;
	__u8    s_last_error_errcode;
/*27c*/ __le16	s_encoding;		/* Filename charset encoding */
	__le16	s_encoding_flags;	/* Filename charset encoding flags */
	__le32	s_reserved[95];		/* Padding to the end of the block */
/*3fc*/	__u32	s_checksum;		/* crc32c(superblock) */
};

Block Groups

To provide failure protection, the ext2 file system uses block groups. Each block group has a descriptor that tells us information about that particular block group. The descriptor does not hold the group's data itself; it holds the block numbers where that data lives. For example, bg_block_bitmap is a 32-bit number that tells us which block the block bitmap is located in.

Right after the super block is the list of group descriptors. We can determine the number of descriptors by calculating the number of blocks and the number of blocks per group.

constexpr uint64_t NUM_BLOCK_GROUPS(const ext2_super_block &sb) 
{
    return (sb.s_blocks_count - 1) / sb.s_blocks_per_group + 1;
}

Each group descriptor has the following structure and is 32 bytes.

/*
 * On-disk block group descriptor (32 bytes).
 *
 * bg_block_bitmap, bg_inode_bitmap and bg_inode_table are block numbers
 * telling where the respective structure is stored; they are not the
 * bitmaps or the table themselves.
 */
struct ext2_group_desc
{
    __u32  bg_block_bitmap;         /* Blocks bitmap block */
    __u32  bg_inode_bitmap;         /* Inodes bitmap block */
    __u32  bg_inode_table;          /* Inodes table block */
    __u16  bg_free_blocks_count;    /* Free blocks count */
    __u16  bg_free_inodes_count;    /* Free inodes count */
    __u16  bg_used_dirs_count;      /* Directories count */
    __u16  bg_flags;                /* Per-group flags */
    __u32  bg_exclude_bitmap_lo;    /* Exclude bitmap for snapshots */
    __u16  bg_block_bitmap_csum_lo; /* crc32c(s_uuid+grp_num+bitmap) LSB */
    __u16  bg_inode_bitmap_csum_lo; /* crc32c(s_uuid+grp_num+bitmap) LSB */
    __u16  bg_itable_unused;        /* Unused inodes count */
    __u16  bg_checksum;             /* crc16(s_uuid+group_num+group_desc)*/
};

All blocks are specified as absolute blocks, which means we can use a simple formula to find the logical byte offset.

/**
 * Byte offset of an absolute block number, measured from the start of the
 * volume.
 *
 * Block numbers count whole blocks from byte 0 of the volume (block 0 is the
 * one containing the 1024-byte boot area), so the offset is a plain product
 * for every block size.
 *
 * @param block      Absolute block number, e.g. bg_inode_table.
 * @param blocksize  Block size in bytes (1024 << s_log_block_size).
 * @return           block * blocksize, computed in 64 bits.
 */
constexpr uint64_t BLOCK_OFFSET(uint64_t block, uint32_t blocksize)
{
    // `block` is already 64 bits wide, so the multiplication cannot be
    // truncated to 32 bits.
    return block * blocksize;
}

The first 1024 bytes are the boot block, and the block size comes from the superblock (1024 << sb.s_log_block_size). For example, we can find the inode table by seeking to BLOCK_OFFSET(bg.bg_inode_table, 1024 << sb.s_log_block_size).

The bitmap fields are not the bitmaps themselves, but instead, they refer to the block number where the bitmaps can be found. The bitmaps here are identical to how the bitmaps in Minix work.


Inode

A table of inodes floats in each block group of the ext2 file system. We can locate it using the bg_inode_table from the block group descriptor, which is a 32-bit index representing the block that contains the inode table. The inodes are packed one after another in one or more blocks. We know how many inodes belong to a group via the superblock’s sb.s_inodes_per_group. Inode numbers start at 1, not 0, so if we are given an inode number, we first subtract 1 and then divide by sb.s_inodes_per_group to determine which group we need to look at: group = (inode − 1) / sb.s_inodes_per_group. The remainder, (inode − 1) % sb.s_inodes_per_group, is the inode’s index within that group’s inode table.

/*
 * On-disk inode.
 *
 * The hex markers in the left margin are the byte offset of the field that
 * follows them.  osd1 and osd2 are unions whose layout depends on the
 * operating system that created the file (Linux or Hurd variants).
 */
struct ext2_inode {
/*00*/	__u16	i_mode;		/* File mode */
	__u16	i_uid;		/* Low 16 bits of Owner Uid */
	__u32	i_size;		/* Size in bytes */
	__u32	i_atime;	/* Access time */
	__u32	i_ctime;	/* Inode change time */
/*10*/	__u32	i_mtime;	/* Modification time */
	__u32	i_dtime;	/* Deletion Time */
	__u16	i_gid;		/* Low 16 bits of Group Id */
	__u16	i_links_count;	/* Links count */
	__u32	i_blocks;	/* Blocks count */
/*20*/	__u32	i_flags;	/* File flags */
	union {
		struct {
			__u32	l_i_version; /* was l_i_reserved1 */
		} linux1;
		struct {
			__u32  h_i_translator;
		} hurd1;
	} osd1;				/* OS dependent 1 */
/*28*/	__u32	i_block[EXT2_N_BLOCKS];/* Pointers to blocks */
/*64*/	__u32	i_generation;	/* File version (for NFS) */
	__u32	i_file_acl;	/* File ACL */
	__u32	i_size_high;	/* presumably the high 32 bits of the size -- confirm */
/*70*/	__u32	i_faddr;	/* Fragment address */
	union {
		struct {
			__u16	l_i_blocks_hi;
			__u16	l_i_file_acl_high;
			__u16	l_i_uid_high;	/* these 2 fields    */
			__u16	l_i_gid_high;	/* were reserved2[0] */
			__u16	l_i_checksum_lo; /* crc32c(uuid+inum+inode) */
			__u16	l_i_reserved;
		} linux2;
		struct {
			__u8	h_i_frag;	/* Fragment number */
			__u8	h_i_fsize;	/* Fragment size */
			__u16	h_i_mode_high;
			__u16	h_i_uid_high;
			__u16	h_i_gid_high;
			__u32	h_i_author;
		} hurd2;
	} osd2;				/* OS dependent 2 */
};

Just like the Minix 3 file system, the ext2 file system uses an index system. However, the i_block[] array in the ext2 is slightly bigger. i_block[0:11] are direct pointers, i_block[12] is a singly indirect pointer, i_block[13] is a doubly indirect pointer, and i_block[14] is a triply indirect pointer.

The special inodes are located from inodes 1 through 10.

/*
 * Special inode numbers
 *
 * These are the reserved inode numbers 1 through 10; the EXT4_-prefixed
 * names share the same number space as the EXT2_ ones.
 */
#define EXT2_BAD_INO             1 /* Bad blocks inode */
#define EXT2_ROOT_INO            2 /* Root inode */
#define EXT4_USR_QUOTA_INO       3 /* User quota inode */
#define EXT4_GRP_QUOTA_INO       4 /* Group quota inode */
#define EXT2_BOOT_LOADER_INO     5 /* Boot loader inode */
#define EXT2_UNDEL_DIR_INO       6 /* Undelete directory inode */
#define EXT2_RESIZE_INO          7 /* Reserved group descriptors inode */
#define EXT2_JOURNAL_INO         8 /* Journal inode */
#define EXT2_EXCLUDE_INO         9 /* The "exclude" inode, for snapshots */
#define EXT4_REPLICA_INO        10 /* Used by non-upstream feature */

As you can see above, inode #2 is the root inode (the top-level directory).


Directory Entries

Whether an inode is a directory can be determined via the inode’s mode. The S_IFDIR (0o40000) bit will be set for any directory (this is bit index 14). You can check this by masking the mode with S_IFMT (0o170000) and then comparing the result to S_IFDIR (0o40000) or S_IFREG (0o100000). There are others, such as a FIFO, socket, and so forth, but we will not use those.

/* Maximum number of bytes a file name can occupy in a directory entry. */
#define EXT2_NAME_LEN 255
/*
 * Directory entry.
 *
 * name[] is declared at its maximum size, but the number of bytes in use is
 * given by the name length stored in file_type_name_len; step to the next
 * entry with rec_len.
 */
struct ext2_dir_entry {
   __u32  inode;                /* Offset 0, Inode number */
   __u16  rec_len;              /* Offset 4, Directory entry length */
   __u16  file_type_name_len;   /* Offset 6, file type (upper 8 bits) and name length (lower 8 bits) */
   char   name[EXT2_NAME_LEN];  /* Offset 8, File name */
};

The directory entry has an inode number. This tells us what inode covers which directory entry. The rec_len is the length of the structure on the disk. To get to the next directory entry, we would seek dirent.rec_len bytes. The file_type_name_len is a weird field. Initially, they split this into two __u8 fields, but that led to endianness problems. So, it is all one field. The upper 8 bits of this field contains the file type, and the lower 8 bits of this field contains how many characters are in the file name.

/* Lower 8 bits: number of characters in the file name. */
int name_len = dirent.file_type_name_len & 0xff;
/* Upper 8 bits: one of the EXT2_FT_* file type codes. */
int file_type = dirent.file_type_name_len >> 8;
/*
 * Ext2 directory file types.  Only the low 3 bits are used.  The
 * other bits are reserved for now.
 */
#define EXT2_FT_UNKNOWN     0 /* 0b000 */
#define EXT2_FT_REG_FILE    1 /* 0b001 */
#define EXT2_FT_DIR         2 /* 0b010 */
#define EXT2_FT_CHRDEV      3 /* 0b011 */
#define EXT2_FT_BLKDEV      4 /* 0b100 */
#define EXT2_FT_FIFO        5 /* 0b101 */
#define EXT2_FT_SOCK        6 /* 0b110 */
#define EXT2_FT_SYMLINK     7 /* 0b111 */

/* One more than the largest file type value defined above. */
#define EXT2_FT_MAX         8

This system is faster, since we can determine the file type directly from the directory entry instead of having to locate the inode and looking at its mode.

Unlike Minix, the directory entries have a variable length. You can see there are two length fields: (1) rec_len and (2) name_len. The rec_len is the total size of the directory entry record whereas name_len is the number of bytes being used in the name[] field. The name_len can be a maximum of 255 bytes.

Each directory entry follows the previous one; however, recall that each directory entry has a different size, so we must add rec_len to the disk offset to find the next entry. The entry table ends with a dummy tail, which has a structure similar to the directory entry. The tail is included in the inode size! So, when reading directory entries, read up to inode.i_size − 12. Then, read the 12-byte tail and check to see if the 0xDE00 marker is present. If it isn’t, there is an error or a bad block. Unless the directory entry is relevant to what we’re trying to do, we can just ignore that error.

/*
 * This is a bogus directory entry at the end of each leaf block that
 * records checksums.
 *
 * It has the same first three field sizes as a real directory entry
 * (4 + 2 + 2 bytes) followed by a 4-byte checksum, 12 bytes in total.
 */
struct ext2_dir_entry_tail {
   __u32  det_reserved_zero1;    /* Pretend to be unused */
   __u16  det_rec_len;           /* 12 */
   __u16  det_reserved_name_len; /* 0xDE00, fake namelen/filetype */
   __u32  det_checksum;          /* crc32c(uuid+inode+dirent) */
};

WARNING: When reading directory entries, it may span more than one block, and those blocks might not be contiguous!

Files

A file is just a set of data held in blocks. We can tell the size of a file by the inode (i_size). The actual data is stored in blocks, which are pointed to by the i_block[] pointers.


The Extended File System Version 4 (ext4)

The ext4 file system was created in 2007 to solve the problems with the ext2 and ext3 file systems. Yes, there is an ext3, but we’re skipping it.

Ext4 modifies ext2/ext3 to support larger files, larger disks, and faster lookups. The inode grows to 256 bytes to support nanosecond timestamping, the superblock contains additional information, and the block pointers can now become extent pointers (see below).


Changes from Ext2

To support the new features, the ext4 file system made some changes to the basic structures of the ext2 file system to include a bigger group descriptor and a bigger inode. Furthermore, ext4 adds a block mechanism known as extents, which are described below.

Bigger Group Descriptor

The group descriptor’s size grows from 32 bytes to 64 bytes in the ext4 file system and has the following structure.

/*
 * On-disk ext4 block group descriptor (64 bytes).
 *
 * The first twelve fields are the same as in ext2_group_desc; the *_hi fields
 * that follow carry the most-significant part of the matching earlier field.
 */
struct ext4_group_desc
{
    __u32 bg_block_bitmap;        /* Blocks bitmap block */
    __u32 bg_inode_bitmap;	  /* Inodes bitmap block */
    __u32 bg_inode_table;         /* Inodes table block */
    __u16 bg_free_blocks_count;	  /* Free blocks count */
    __u16 bg_free_inodes_count;	  /* Free inodes count */
    __u16 bg_used_dirs_count;     /* Directories count */
    __u16 bg_flags;               /* EXT4_BG_flags (INODE_UNINIT, etc) */
    __u32 bg_exclude_bitmap_lo;	  /* Exclude bitmap for snapshots */
    __u16 bg_block_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */
    __u16 bg_inode_bitmap_csum_lo;/* crc32c(s_uuid+grp_num+bitmap) LSB */
    __u16 bg_itable_unused;       /* Unused inodes count */
    __u16 bg_checksum;            /* crc16(sb_uuid+group+desc) */
    __u32 bg_block_bitmap_hi;     /* Blocks bitmap block MSB */
    __u32 bg_inode_bitmap_hi;     /* Inodes bitmap block MSB */
    __u32 bg_inode_table_hi;      /* Inodes table block MSB */
    __u16 bg_free_blocks_count_hi;/* Free blocks count MSB */
    __u16 bg_free_inodes_count_hi;/* Free inodes count MSB */
    __u16 bg_used_dirs_count_hi;  /* Directories count MSB */
    __u16 bg_itable_unused_hi;    /* Unused inodes count MSB */
    __u32 bg_exclude_bitmap_hi;   /* Exclude bitmap block MSB */
    __u16 bg_block_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */
    __u16 bg_inode_bitmap_csum_hi;/* crc32c(s_uuid+grp_num+bitmap) MSB */
    __u32 bg_reserved;            /* Reserved */
};

The newer structures have the fields with _hi. These need to be put into a 64-bit value, even though they are only 48 bits. We can put these into their correct place by shifting:

/**
 * Combine the two halves of the inode table location into one 64-bit block
 * number.
 *
 * @param bg  Group descriptor to read bg_inode_table / bg_inode_table_hi from.
 * @return    bg_inode_table in bits 0-31, bg_inode_table_hi in bits 32-63.
 */
constexpr uint64_t INODE_TABLE_BLOCK(const ext4_group_desc &bg) {
    // Widen both halves to 64 bits *before* shifting so the shift by 32
    // operates on a 64-bit value.
    const uint64_t upper_half = bg.bg_inode_table_hi;
    const uint64_t lower_half = bg.bg_inode_table;
    return (upper_half << 32) | lower_half;
}

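The important reason I took the steps I did above without condensing the code is that we want the left shift (<<) to be done on a 64-bit value and NOT a 32-bit or even 16-bit value. Recall that shifting a value that is 32 bits wide (or narrower) left by 32 does not work: in C and C++, shifting by the full width of the type or more is undefined behavior, so the high half would be lost instead of landing in bits 32–63.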


Bigger Inode

The inode adds extra checksum sizes and 64-bit time signatures to support nanosecond timestamps. The bigger inode can be up to 256 bytes, but it is specified by the superblock’s s_inode_size field.

/*
 * Permanent part of a large inode on the disk
 *
 * Everything up to offset 0x80 has the same fields, in the same order, as
 * struct ext2_inode; the fields from i_extra_isize onward are the additions.
 */
struct ext2_inode_large {
/*00*/	__u16	i_mode;		/* File mode */
	__u16	i_uid;		/* Low 16 bits of Owner Uid */
	__u32	i_size;		/* Size in bytes */
	__u32	i_atime;	/* Access time */
	__u32	i_ctime;	/* Inode Change time */
/*10*/	__u32	i_mtime;	/* Modification time */
	__u32	i_dtime;	/* Deletion Time */
	__u16	i_gid;		/* Low 16 bits of Group Id */
	__u16	i_links_count;	/* Links count */
	__u32	i_blocks;	/* Blocks count */
/*20*/	__u32	i_flags;	/* File flags */
	union {
		struct {
			__u32	l_i_version; /* was l_i_reserved1 */
		} linux1;
		struct {
			__u32  h_i_translator;
		} hurd1;
	} osd1;				/* OS dependent 1 */
/*28*/	__u32	i_block[EXT2_N_BLOCKS];/* Pointers to blocks */
/*64*/	__u32	i_generation;	/* File version (for NFS) */
	__u32	i_file_acl;	/* File ACL */
	__u32	i_size_high;	/* presumably the high 32 bits of the size -- confirm */
/*70*/	__u32	i_faddr;	/* Fragment address */
	union {
		struct {
			__u16	l_i_blocks_hi;
			__u16	l_i_file_acl_high;
			__u16	l_i_uid_high;	/* these 2 fields    */
			__u16	l_i_gid_high;	/* were reserved2[0] */
			__u16	l_i_checksum_lo; /* crc32c(uuid+inum+inode) */
			__u16	l_i_reserved;
		} linux2;
		struct {
			__u8	h_i_frag;	/* Fragment number */
			__u8	h_i_fsize;	/* Fragment size */
			__u16	h_i_mode_high;
			__u16	h_i_uid_high;
			__u16	h_i_gid_high;
			__u32	h_i_author;
		} hurd2;
	} osd2;				/* OS dependent 2 */
/*80*/	__u16	i_extra_isize;
	__u16	i_checksum_hi;	/* crc32c(uuid+inum+inode) */
	__u32	i_ctime_extra;	/* extra Change time (nsec << 2 | epoch) */
	__u32	i_mtime_extra;	/* extra Modification time (nsec << 2 | epoch) */
	__u32	i_atime_extra;	/* extra Access time (nsec << 2 | epoch) */
/*90*/	__u32	i_crtime;	/* File creation time */
	__u32	i_crtime_extra;	/* extra File creation time (nsec << 2 | epoch)*/
	__u32	i_version_hi;	/* high 32 bits for 64-bit version */
/*9c*/	__u32   i_projid;       /* Project ID */
};

Extents

The ext4 changed the block pointers into a system called an extent. These allow for 48-bit pointers rather than the old 32-bit pointers. An extent starts with an extent header, which tells us what type of extent it is (magic) and how we should interpret it.

Ext4 can still use the old i_block[] pointers. We check a flag in the inode (i_flags). The flag we’re looking for is EXT4_EXTENTS_FL (ext4 extents flag), which is the value 0x80000. If this flag is set, then i_block is interpreted as an extent header.

/*
 * each block (leaves and indexes), even inode-stored has header
 *
 * The header is 12 bytes (4 x 16-bit fields + 1 x 32-bit field) and is
 * followed by eh_entries records of 12 bytes each.
 */
struct extent_header {
    __le16  eh_magic;      /* probably will support different formats */
    __le16  eh_entries;	   /* number of valid entries */
    __le16  eh_max;        /* capacity of store in entries */
    __le16  eh_depth;      /* has tree real underlying blocks? */
    __le32  eh_generation; /* generation of the tree */
};

The extent header is 12 bytes, so it covers i_block[0:2], which can just be reinterpreted from the i_block array.

/**
 * View the start of an inode's i_block[] array as an extent header.
 *
 * No data is copied: the returned pointer aliases the caller's array, so it
 * is only valid for as long as that array is.
 *
 * This is declared inline rather than constexpr because reinterpreting a
 * pointer is not allowed in a constant expression, which makes a constexpr
 * version ill-formed (and rejected by some compilers).
 *
 * @param blocks  The inode's i_block[] array; the header occupies its first
 *                12 bytes (i_block[0] through i_block[2]).
 * @return        Pointer to the same memory, typed as an extent header.
 */
inline struct extent_header *get_extent_header(__u32 blocks[]) {
    return reinterpret_cast<struct extent_header *>(blocks);
}

The only extent magic supported right now is 0xf30a, which is an ext3 extent. These are listed in a tree fashion. The interior nodes of the tree are called extent indices (extent_index):

/*
 * this is index on-disk structure
 * it's used at all the levels, but the bottom
 *
 * 12 bytes; ei_leaf and ei_leaf_hi together form a 48-bit physical block
 * number.
 */
struct extent_index {
    __le32  ei_block;   /* index covers logical blocks from 'block' */
    __le32  ei_leaf;    /* pointer to the physical block of the next */
    __le16  ei_leaf_hi;	/* high 16 bits of physical block */
    __le16  ei_unused;  /* Not used */
};

The leaf nodes of the extent tree are called extents, and each one has the following structure.

/*
 * this is extent on-disk structure
 * it's used at the bottom of the tree
 *
 * 12 bytes; ee_start_hi and ee_start together form a 48-bit physical block
 * number.
 */
struct extent {
    __le32  ee_block;    /* first logical block extent covers */
    __le16  ee_len;      /* number of blocks covered by extent */
    __le16  ee_start_hi; /* high 16 bits of physical block */
    __le32  ee_start;    /* low 32 bits of physical block */
};

The extent is also 12 bytes. The first field, ee_block, contains the logical block this extent covers. The ee_start is split into 16 bits and then 32 bits to make a 48-bit physical block pointer. The logical block is the block as it looks to this file, whereas the physical block is as it looks on the disk. For example, if ee_block is 3 and ee_start is 1234, then the fourth block (index #3) of the file can be found at block 1234 on the disk.

Extents are laid out in a tree fashion, as shown below from the academic paper on EXT4.

Extents System in Ext4
  • Extent header – describes the depth and number of entries in a subset.
  • Extent index – describes a tree branch. A branch tells us where to find extents or other branches.
  • Extent – describes a tree leaf. A leaf tells us where to find the file data by pointing to the blocks.

Above shows an extent whose depth is > 0. If the depth is 0, then we don’t have an extent index, and instead, the extent structure follows the header. The depth numbers decrease as we get closer to the leaves, meaning that if we have root > branchA > branchB > leaf, then the depth at root will be 3, depth at branchA would be 2, depth at branchB would be 1, and the depth at the leaf would be 0. This tells us whether we are looking at an extent or extent index.

The inode itself might be the leaf. This is called an inode-stored extent. Like always, we can detect this by seeing the depth is 0 when reading the eh_depth of the header. For smaller files, all of the extents will be stored in the inode. The i_block is a field of \(15\times 4=60\) bytes. The extent header takes 12 of those bytes, which means that a total of \((60-12)\div 12=4\) extents can fit in the inode itself, following directly after the extent header.