教你写一个linux 下的打包软件 tar

2020-02-22

教你写一个linux 下的打包软件 tar

公众号

相信你对 linux 的 .tar.gz 有点熟悉，这就是先 tar 打包（.tar 后缀），再对此 tar 文件用 gzip压缩(.tar.gz)的后缀名。

值得注意的是，tar 不是压缩软件，它只做一件事：把一堆文件/文件夹打包到一个文件（tar 文件）里，而文件链接、文件权限、相对路径等信息都会给你保存好。

一开始的设计是 tar 跟 gzip 各自只做一件事情，各司其职；后来发现这样用起来太麻烦了，于是就把压缩功能整合到 tar 里了。

- Create a gzipped archive:
    tar czf target.tar.gz file1 file2 file3

最近学习 OS 时写了一个类似tar的项目，那么今天就趁热打铁简单说一下如何写一个打包软件，这个软件会将重复的文件内容通过 md5 比较，复用旧的内容。

基本单位 block

block 可以理解为文件系统的最小单位，分别有以下类型：

directory block，文件夹 block，存储文件夹 meta 信息；
file block，文件 block，存储文件 meta 信息；
data block，只用来存文件内容；

Directory block，注意的是 entry 里要有 fileindex 来存储重复文件的 name 的下标。

同时，给项目一个 root dir。

// Directory block: metadata describing one directory stored in the volume.
typedef struct {
    char		name[SIFS_MAX_NAME_LENGTH]; // name of the directory
    time_t		modtime;	// time last modified <- time()

    uint32_t		nentries;// number of files/subdirectories inside this directory
    struct {
        SIFS_BLOCKID	blockID;	// blockID of the subdirectory or file
        uint32_t	fileindex;	// index of this entry's name among the names of files that share the same contents
    } entries[SIFS_MAX_ENTRIES];
} SIFS_DIRBLOCK;

文件 block：length 是文件内容的字节数，之后用来计算需要多少个 data block；firstblockID 记录第一个 data block 的 id；nfiles 记录有多少个文件的内容与之相同；filenames 则保存这些内容相同的文件各自的名字。

// File block: metadata shared by every file whose contents are identical.
typedef struct {
    time_t		modtime;	// time first file added <- time()
    size_t		length;		// length of files' contents in bytes

    unsigned char	md5[MD5_BYTELEN];// the MD5 cryptographic digest (a summary) of the files' contents
    SIFS_BLOCKID	firstblockID;// the block number (blockID) of the files' first data-block

    uint32_t		nfiles;		// n files with identical contents
    char		filenames[SIFS_MAX_ENTRIES][SIFS_MAX_NAME_LENGTH];// names of the files that share these contents, one name per row
} SIFS_FILEBLOCK;

bitmaps数组，记录了每个 block 的类型，有：文件、文件夹以及data block 三种类型。

通用函数

就让大家看看关键函数好了：

读 tar 后的文件的 meta 头，记录了 block 的大小（ blocksize）以及多少个 blocks。

/*
 * Read the volume header from the current position of `vol` into `*header`
 * and print its block size and block count to stdout.
 *
 * If the header cannot be read in full, an error is reported on stderr,
 * `*header` is zeroed so the caller never works with indeterminate values,
 * and nothing is printed to stdout.
 */
void read_vol_header(FILE *vol, SIFS_VOLUME_HEADER *header) {
    if (fread(header, sizeof(SIFS_VOLUME_HEADER), 1, vol) != 1) {
        fprintf(stderr, "read_vol_header: failed to read volume header\n");
        /* A short read leaves the buffer contents indeterminate. */
        *header = (SIFS_VOLUME_HEADER){0};
        return;
    }
    printf("header->blocksize %zu, header->nblocks %u\n", header->blocksize , header->nblocks);
}

bitmap，每次操作 tar 文件都要读的。

/*
 * Read `nblocks` bitmap entries from the current position of `vol` into
 * `bitmap`.
 *
 * A non-positive `nblocks` reads nothing. A short read is reported on
 * stderr; entries beyond the ones actually read are left untouched.
 */
void read_bitmap(FILE *vol, SIFS_BIT *bitmap, int nblocks) {
    if (nblocks <= 0) {
        /* A negative count would otherwise turn into a huge byte count. */
        return;
    }

    /* Let fread do the size arithmetic in size_t, one item per entry. */
    size_t wanted = (size_t)nblocks;
    size_t got = fread(bitmap, sizeof(SIFS_BIT), wanted, vol);
    if (got != wanted) {
        fprintf(stderr, "read_bitmap: expected %zu entries, read %zu\n", wanted, got);
    }
}

root_block同理，读和写啥东西都要从 root block、root dir 出发。

/*
 * Read one directory block from the current position of `vol` into
 * `*dirblock` and print its name, entry count and modification time.
 *
 * If the block cannot be read in full, an error is reported on stderr,
 * `*dirblock` is zeroed (so it describes a directory with no entries) and
 * nothing is printed to stdout.
 */
void read_root_block(FILE *vol, SIFS_DIRBLOCK *dirblock){
    if (fread(dirblock, sizeof(SIFS_DIRBLOCK), 1, vol) != 1) {
        fprintf(stderr, "read_root_block: failed to read directory block\n");
        *dirblock = (SIFS_DIRBLOCK){0};
        return;
    }
    /*
     * nentries is uint32_t and modtime is time_t, so both are cast to the
     * type the conversion specifier expects. The name comes straight from
     * disk, so its printed length is capped at the size of the array.
     */
    printf("read_root_block finish, dirblock.name: %.*s, dirblock.entrieds: %u, dirblock.modtime %ld\n",
           (int)sizeof dirblock->name, dirblock->name,
           (unsigned int)dirblock->nentries,
           (long)dirblock->modtime);
}

路径嘛，你懂的，./sifs_put volumn ~/res.txt /dirB/subdirB/subsubdir/newfileB，要读的内容可以靠 read 函数解决，但是写到 tar 文件里的就要手动解析递归查路径了。

/*
 * Split `pathname` on '/' and append a copy of every component to
 * `route_names`, incrementing `*route_cnt` once per component.
 *
 * `pathname` itself is not modified; the split runs on a private copy.
 * Empty components (from a leading '/' or from "//") are stored as empty
 * strings, exactly as strsep() yields them.
 *
 * The caller must ensure `route_names` has room for every component and
 * that `*route_cnt` holds the index of the first free slot on entry.
 * Does nothing if any argument is NULL.
 */
void read_route_names(char* pathname, char** route_names, int *route_cnt) {
    if (pathname == NULL || route_names == NULL || route_cnt == NULL) {
        return;
    }

    /* copyStr() already returns a copy of the text; no second strcpy needed. */
    char *buffer = copyStr(pathname);
    if (buffer == NULL) {
        return;
    }

    /* strsep() advances (and finally NULLs) its argument, so walk a cursor
     * and keep `buffer` pointing at the start of the scratch copy. */
    char *cursor = buffer;
    char *dir;
    while ((dir = strsep(&cursor, "/")) != NULL) {
        route_names[*route_cnt] = copyStr(dir);
        (*route_cnt)++;
    }

    /* NOTE(review): `buffer` is never released. copyStr() presumably
     * allocates with malloc(), in which case free(buffer) belongs here —
     * confirm against copyStr()'s definition before adding it. */
}

以上几乎是 mkdir，rmdir，writefile，readfile，putfile等等操作都要做的。

实现

然后，应该举一个 readfile 的例子就可以做代表了。

/*
 * Walk the directory tree from cur_dir_block, matching subdirectory names
 * against route_names[route_name_p] onward; once the second-to-last
 * component matches, the last component is looked up as a file.
 * Returns the result of do_read_file() on that lookup, or 1 when no
 * matching subdirectory is found.
 */
int recursive_dirinfo(SIFS_DIRBLOCK *cur_dir_block, char **route_names, int route_name_p, int route_cnt);

实现：

/*
 * Resolve a path by descending through directory blocks.
 *
 * cur_dir_block  directory whose entries are searched at this level
 * route_names    path components; the last one is the file name
 * route_name_p   index of the directory component to match at this level
 * route_cnt      number of components in route_names
 *
 * Each entry of cur_dir_block that the bitmap marks as a directory is read
 * from the volume and its name compared with route_names[route_name_p].
 * On the first match the search either descends one level or, when the
 * matched directory is the second-to-last component, looks the last
 * component up as a file via do_read_file().
 *
 * Returns the result of do_read_file(), or 1 when no subdirectory matches
 * or the route has no file component after route_name_p.
 */
int recursive_dirinfo(SIFS_DIRBLOCK *cur_dir_block, char **route_names, int route_name_p, int route_cnt) {
    /* A directory component plus at least one component after it are
     * required; otherwise route_names would be indexed out of range. */
    if (route_name_p < 0 || route_name_p + 1 >= route_cnt) {
        return 1;
    }

    /* The block area starts after the volume header and the bitmap. */
    const int start = sizeof(SIFS_VOLUME_HEADER) + header.nblocks*sizeof(SIFS_BIT);
    const int at_last_dir = (route_name_p + 2 == route_cnt);

    for (uint32_t i = 0; i < cur_dir_block->nentries; i++) {
        int blockid = cur_dir_block->entries[i].blockID;
        if (bitmap[blockid] != SIFS_DIR) {
            continue;
        }

        SIFS_DIRBLOCK dirblock;
        read_dir_block(vol, &dirblock, blockid * blocksize, start);
        if (strcmp(dirblock.name, route_names[route_name_p]) != 0) {
            continue;
        }

        if (at_last_dir) {
            /* Pass the directory that was just matched, so the block and
             * its block ID describe the same directory. */
            return do_read_file(&dirblock, route_names[route_name_p+1], blockid);
        }
        return recursive_dirinfo(&dirblock, route_names, route_name_p+1, route_cnt);
    }
    return 1;
}

以 `./sifs_put volumn ~/res.txt /dirB/subdirB/subsubdir/newfileB` 为例子，如果递归找到 `subsubdir` 这个文件夹 block，就进行相应操作：

写文件就往 bitmap 一直找没有用过的 block，够写文件就写进去，文件夹更新一下信息。
读文件就是根据此文件夹 block，找里面的 newfileB

/*
 * Find the file block that carries `filename` and copy the file's contents
 * into the buffer at *data, storing the content length in *nbytes.
 *
 * parent_dir and parent_dir_block are accepted for interface compatibility
 * but are not consulted: the lookup scans every block the bitmap marks as
 * a file (block 0 is skipped).
 *
 * A file block matches when `filename` equals any of the names it records,
 * so files stored as duplicates of earlier contents are found too.
 * Data blocks are read consecutively starting at firstblockID, as the
 * original loop did.
 *
 * Returns 0 when the file was found and read, 1 otherwise. *nbytes is only
 * written on a match.
 *
 * NOTE(review): *data must already have room for the file's length in
 * bytes; its allocation is not visible here — confirm at the call site.
 */
int do_read_file(SIFS_DIRBLOCK *parent_dir, char *filename,  int parent_dir_block) {
    (void)parent_dir;
    (void)parent_dir_block;

    printf("do_find_file_info, filename %s\n", filename);

    const size_t bsize = header.blocksize;
    if (bsize == 0) {
        /* Nothing can be addressed with a zero block size. */
        return 1;
    }

    /* The block area starts after the volume header and the bitmap. */
    const int start = sizeof(SIFS_VOLUME_HEADER) + header.nblocks*sizeof(SIFS_BIT);

    for(int i=1; i<header.nblocks ; i++) {
        if(bitmap[i]!=SIFS_FILE) {
            continue;
        }

        SIFS_FILEBLOCK fileblock;
        read_file_block(vol, &fileblock, i * blocksize, start);

        /* Slot 0 is always checked; further slots only up to nfiles,
         * capped at the capacity of the array. */
        uint32_t names = fileblock.nfiles;
        if (names > SIFS_MAX_ENTRIES) {
            names = SIFS_MAX_ENTRIES;
        }
        int matched = (strcmp(fileblock.filenames[0], filename) == 0);
        for (uint32_t k = 1; !matched && k < names; k++) {
            matched = (strcmp(fileblock.filenames[k], filename) == 0);
        }
        if (!matched) {
            continue;
        }

        *nbytes = fileblock.length;

        /* Copy block by block; each block lands bsize bytes after the
         * previous one, and the final (possibly partial) block is limited
         * to the bytes that remain. Assumes read_data_block()'s third
         * argument is the byte count to read — TODO confirm. */
        char *dest = (char*)(*data);
        size_t remaining = fileblock.length;
        int d_block_id = fileblock.firstblockID;
        while (remaining > 0) {
            size_t chunk = remaining < bsize ? remaining : bsize;
            read_data_block(vol, dest, chunk, d_block_id * header.blocksize, start);
            dest += chunk;
            remaining -= chunk;
            d_block_id++;
        }
        return 0;
    }
    return 1;
}

而真实的 tar 自然更复杂，还要记录文件权限、所属用户、所属用户组等等信息：

/*
 * Header record of one tar archive entry. Every field is a fixed-width
 * character array; the number beside each field is its byte offset, and
 * the fields together occupy 500 bytes.
 */
struct posix_header
{                       /* byte offset */
  char name[100];       /*   0 */   /* file name */
  char mode[8];         /* 100 */   /* permission bits */
  char uid[8];          /* 108 */   /* user id */
  char gid[8];          /* 116 */   /* group id */
  char size[12];        /* 124 */   /* file size */
  char mtime[12];       /* 136 */   /* modification time */
  char chksum[8];       /* 148 */   /* checksum */
  char typeflag;        /* 156 */   /* file type flag */
  char linkname[100];   /* 157 */   /* target of the link */
  char magic[6];        /* 257 */
  char version[2];      /* 263 */
  char uname[32];       /* 265 */   /* user name */
  char gname[32];       /* 297 */   /* group name */
  char devmajor[8];     /* 329 */   /* device file major number */
  char devminor[8];     /* 337 */   /* device file minor number */
  char prefix[155];     /* 345 */
                        /* 500 */
};

文件类型标志定义，包含了所有 Unix 系统中的文件类型

/* Values stored in the typeflag field of struct posix_header. */
#define REGTYPE  '0'            /* regular file */
#define LNKTYPE  '1'            /* link (hard link in the POSIX ustar format) */
#define SYMTYPE  '2'            /* reserved (symbolic link in the POSIX ustar format) */
#define CHRTYPE  '3'            /* character special */
#define BLKTYPE  '4'            /* block special */
#define DIRTYPE  '5'            /* directory */
#define FIFOTYPE '6'            /* FIFO special */
#define CONTTYPE '7'            /* reserved */

概览如此，写起来其实有点烦 - = -，有兴趣的读者可以写写。

For The Best Thing In The World

Work Hard to Enjoy Them.