GIT初始版源码解析

一、背景

一方面想对git底层工作原理有更多理解，另外观摩下Linus大神的编码思路和风格

二、版本选择

git项目地址：https://github.com/git/git.git

开源库随着功能增加代码越来越庞大，并且主干思想也会被越来越多的分支细节所淹没，所以直接选择git第一个版本代码进行参考。时间回到2005年4月7号的下午。

commit e83c5163316f89bfbde7d9ab23ca2e25604af290 (HEAD)
Author: Linus Torvalds 
Date:   Thu Apr 7 15:13:13 2005 -0700

进到目录里看一下，只有几个文件，总共代码行才1000出头

三、代码运行

review+debug是学习代码库非常有效的方法，所以先让代码跑起来。mac上尝试编译，出现一些警告以及错误。本地做一些修改后编译通过。有同学要自己尝试动手，可以参照以下修改：

1、安装openssl库以及zlib库

brew install openssl

brew install zlib

2、修改编译以及链接选项并指定头文件以及库位置同时关闭弃用函数报警

CFLAGS=-g -Wno-deprecated -I/usr/local/opt/openssl/include/

LDFLAGS=-L/usr/local/opt/openssl/lib/ -L/usr/local/opt/zlib/lib/

3、链接库修改，从 -lssl 改为 -lcrypto -lz

4、main函数增加返回值、修改时间相关结构体

5、m1芯片mac没有找到可用gdb版本，可以使用lldb代替

以下是具体修改点：

[graypig:]$ git diff
diff --git a/Makefile b/Makefile
index a6bba79ba1..fe779bdb75 100644
--- a/Makefile
+++ b/Makefile
@@ -1,4 +1,5 @@
-CFLAGS=-g
+CFLAGS=-g -Wno-deprecated -I/usr/local/opt/openssl/include/
+LDFLAGS=-L/usr/local/opt/openssl/lib/ -L/usr/local/opt/zlib/lib/
 CC=gcc
 
 PROG=update-cache show-diff init-db write-tree read-tree commit-tree cat-file
@@ -8,27 +9,27 @@ all: $(PROG)
 install: $(PROG)
        install $(PROG) $(HOME)/bin/
 
-LIBS= -lssl
+LIBS= -lcrypto -lz
 
 init-db: init-db.o
 
 update-cache: update-cache.o read-cache.o
-       $(CC) $(CFLAGS) -o update-cache update-cache.o read-cache.o $(LIBS)
+       $(CC) $(CFLAGS) $(LDFLAGS) -o update-cache update-cache.o read-cache.o $(LIBS)

 show-diff: show-diff.o read-cache.o
-       $(CC) $(CFLAGS) -o show-diff show-diff.o read-cache.o $(LIBS)
+       $(CC) $(CFLAGS) $(LDFLAGS) -o show-diff show-diff.o read-cache.o $(LIBS)
 
 write-tree: write-tree.o read-cache.o
-       $(CC) $(CFLAGS) -o write-tree write-tree.o read-cache.o $(LIBS)
+       $(CC) $(CFLAGS) $(LDFLAGS) -o write-tree write-tree.o read-cache.o $(LIBS)
 
 read-tree: read-tree.o read-cache.o
-       $(CC) $(CFLAGS) -o read-tree read-tree.o read-cache.o $(LIBS)
+       $(CC) $(CFLAGS) $(LDFLAGS) -o read-tree read-tree.o read-cache.o $(LIBS)
 
 commit-tree: commit-tree.o read-cache.o
-       $(CC) $(CFLAGS) -o commit-tree commit-tree.o read-cache.o $(LIBS)
+       $(CC) $(CFLAGS) $(LDFLAGS) -o commit-tree commit-tree.o read-cache.o $(LIBS)
 
 cat-file: cat-file.o read-cache.o
-       $(CC) $(CFLAGS) -o cat-file cat-file.o read-cache.o $(LIBS)
+       $(CC) $(CFLAGS) $(LDFLAGS) -o cat-file cat-file.o read-cache.o $(LIBS)
 
 read-cache.o: cache.h
 show-diff.o: cache.h
diff --git a/cache.h b/cache.h
index 98a32a9ad3..161a5aff90 100644
--- a/cache.h
+++ b/cache.h
@@ -12,6 +12,7 @@
 
 #include 
 #include 
+#include 
 
 /*
  * Basic data structures for the directory cache
diff --git a/init-db.c b/init-db.c
index 25dc13fe10..d11b16bff5 100644
--- a/init-db.c
+++ b/init-db.c
@@ -19,8 +19,8 @@ int main(int argc, char **argv)
        sha1_dir = getenv(DB_ENVIRONMENT);
        if (sha1_dir) {
                struct stat st;
-               if (!stat(sha1_dir, &st) < 0 && S_ISDIR(st.st_mode))
-                       return;
+               if (!(stat(sha1_dir, &st) < 0) && S_ISDIR(st.st_mode))
+                       return 0;
                fprintf(stderr, "DB_ENVIRONMENT set to bad directory %s: ", sha1_dir);
        }
 
diff --git a/show-diff.c b/show-diff.c
index b8522886a1..6d00ba2a6f 100644
--- a/show-diff.c
+++ b/show-diff.c
@@ -11,11 +11,11 @@ static int match_stat(struct cache_entry *ce, struct stat *st)
 {
        unsigned int changed = 0;
 
-       if (ce->mtime.sec  != (unsigned int)st->st_mtim.tv_sec ||
-           ce->mtime.nsec != (unsigned int)st->st_mtim.tv_nsec)
+       if (ce->mtime.sec  != (unsigned int)st->st_mtimespec.tv_sec ||
+           ce->mtime.nsec != (unsigned int)st->st_mtimespec.tv_nsec)
                changed |= MTIME_CHANGED;
-       if (ce->ctime.sec  != (unsigned int)st->st_ctim.tv_sec ||
-           ce->ctime.nsec != (unsigned int)st->st_ctim.tv_nsec)
+       if (ce->ctime.sec  != (unsigned int)st->st_ctimespec.tv_sec ||
+           ce->ctime.nsec != (unsigned int)st->st_ctimespec.tv_nsec)
                changed |= CTIME_CHANGED;
        if (ce->st_uid != (unsigned int)st->st_uid ||
            ce->st_gid != (unsigned int)st->st_gid)
diff --git a/update-cache.c b/update-cache.c
index 5085a5cb53..f9c8e0fc69 100644
--- a/update-cache.c
+++ b/update-cache.c
@@ -139,9 +139,9 @@ static int add_file_to_cache(char *path)
        memset(ce, 0, size);
        memcpy(ce->name, path, namelen);
        ce->ctime.sec = st.st_ctime;
-       ce->ctime.nsec = st.st_ctim.tv_nsec;
+       ce->ctime.nsec = st.st_ctimespec.tv_nsec;
        ce->mtime.sec = st.st_mtime;
-       ce->mtime.nsec = st.st_mtim.tv_nsec;
+       ce->mtime.nsec = st.st_mtimespec.tv_nsec;
        ce->st_dev = st.st_dev;
        ce->st_ino = st.st_ino;
        ce->st_mode = st.st_mode;

四、源码分析

1、 init-db.c

核心逻辑：创建缓存目录.dircache/objects，并且在此目录下预创建256个目录，命名规则

.dircache/objects/00 .dircache/objects/01 .dircache/objects/... .dircache/objects/ff

#include "cache.h"

int main(int argc, char **argv)
{
    char *sha1_dir = getenv(DB_ENVIRONMENT), *path;
    int len, i, fd;

    if (mkdir(".dircache", 0700) < 0) {
        perror("unable to create .dircache");
        exit(1);
    }

    /*
     * If you want to, you can share the DB area with any number of branches.
     * That has advantages: you can save space by sharing all the SHA1 objects.
     * On the other hand, it might just make lookup slower and messier. You
     * be the judge.
     */
    sha1_dir = getenv(DB_ENVIRONMENT);
    if (sha1_dir) {
        struct stat st;
        if (!stat(sha1_dir, &st) < 0 && S_ISDIR(st.st_mode))
            return;
        fprintf(stderr, "DB_ENVIRONMENT set to bad directory %s: ", sha1_dir);
    }

    /*
     * The default case is to have a DB per managed directory. 
     */
    sha1_dir = DEFAULT_DB_ENVIRONMENT;
    fprintf(stderr, "defaulting to private storage area
");
    len = strlen(sha1_dir);
    if (mkdir(sha1_dir, 0700) < 0) {
        if (errno != EEXIST) {
            perror(sha1_dir);
            exit(1);
        }
    }
    //注意malloc申请内存后不会清零，但是使用sprintf格式化会在末尾添加，所以不存在越界问题
    path = malloc(len + 40);
    memcpy(path, sha1_dir, len);
    for (i = 0; i < 256; i++) {
        //两个16进制字符格式打印
        sprintf(path+len, "/%02x", i);
        if (mkdir(path, 0700) < 0) {
            if (errno != EEXIST) {
                perror(path);
                exit(1);
           }
        }
    }
    return 0;
}

2、update-cache.c

缓存项设计经过仔细考量，可以直接利用文件字节流还原内存缓存项结构，省掉了拷贝动作。核心逻辑：

首先读取.dircache/index的文件内容，对要加入缓存的文件进行校验后，进行zlib压缩并计算sha1值，按照sha1计算文件的路径.dircache/objects/xx/xx{19}，保存文件然后更新全局cache信息，并将全局cache保存到磁盘上生成新的.dircache/index。

文件内容索引文件格式："blob " + size + null + zlib压缩后的文件内容。

int main(int argc, char **argv)
{
    int i, newfd, entries;

    entries = read_cache();
    if (entries < 0) {
        perror("cache corrupted");
        return -1;
    }

    newfd = open(".dircache/index.lock", O_RDWR | O_CREAT | O_EXCL, 0600);
    if (newfd < 0) {
        perror("unable to create new cachefile");
        return -1;
    }
    for (i = 1 ; i < argc; i++) {
        char *path = argv[i];
        // 判断路径是否合法，排除: . .. //结尾
        if (!verify_path(path)) {
            fprintf(stderr, "Ignoring path %s
", argv[i]);
              continue;
        }
        if (add_file_to_cache(path)) {
            fprintf(stderr, "Unable to add %s to database
", path);
            goto out;
        }
    }
    if (!write_cache(newfd, active_cache, active_nr) && !rename(".dircache/index.lock", ".dircache/index"))
        return 0;
out:
    unlink(".dircache/index.lock");
}

2.1 缓存读取逻辑

read_cache读取缓存逻辑：打开缓存文件.dircache/index，通过mmap将文件映射到内存，校验文件sha1，根据头enty个数还原缓存数据。

   int read_cache(void)
{
    int fd, i;
    struct stat st;
    unsigned long size, offset;
    void *map;
    struct cache_header *hdr;

    errno = EBUSY;
    if (active_cache)
        return error("more than one cachefile");
    errno = ENOENT;
    sha1_file_directory = getenv(DB_ENVIRONMENT);
    if (!sha1_file_directory)
        sha1_file_directory = DEFAULT_DB_ENVIRONMENT;
    if (access(sha1_file_directory, X_OK) < 0)
        return error("no access to SHA1 file directory");
    fd = open(".dircache/index", O_RDONLY);
    if (fd < 0)
        return (errno == ENOENT) ? 0 : error("open failed");

    map = (void *)-1;
    if (!fstat(fd, &st)) {
        map = NULL;
        size = st.st_size;
        errno = EINVAL;
        if (size > sizeof(struct cache_header))
            map = mmap(NULL, size, PROT_READ, MAP_PRIVATE, fd, 0);
    }
    close(fd);
    if (-1 == (int)(long)map)
        return error("mmap failed");

    hdr = map;
    if (verify_hdr(hdr, size) < 0)
        goto unmap;

    // 根据缓存数量来申请内存，预留1.5倍空间
    active_nr = hdr->entries;
    active_alloc = alloc_nr(active_nr);
    active_cache = calloc(active_alloc, sizeof(struct cache_entry *));
    // 通过文件字节直接还原内存结构
    offset = sizeof(*hdr);
    for (i = 0; i < hdr->entries; i++) {
        struct cache_entry *ce = map + offset;
        offset = offset + ce_size(ce);
        active_cache[i] = ce;
    }
    return active_nr;

unmap:
    munmap(map, size);
    errno = EINVAL;
    return error("verify header failed");
}

verify_hdr校验缓存头：通过缓存重新计算sha1，跟缓存头sha1对比进行校验

static int verify_hdr(struct cache_header *hdr, unsigned long size)
{
   SHA_CTX c;
   unsigned char sha1[20];
   // 基础校验, 签名&版本
   if (hdr->signature != CACHE_SIGNATURE)
      return error("bad signature");
   if (hdr->version != 1)
      return error("bad version");
   SHA1_Init(&c);
   // 提取缓存头中除了sha1部分的数据
   SHA1_Update(&c, hdr, offsetof(struct cache_header, sha1));
   // 提取缓存内容数据，hdr+1是指跳过缓存头
   SHA1_Update(&c, hdr+1, size - sizeof(*hdr));
   // 计算sha1
   SHA1_Final(sha1, &c);
   // 对比sha1
   if (memcmp(sha1, hdr->sha1, 20))
      return error("bad header sha1");
   return 0;
}

特殊宏函数说明：

struct cache_entry {
   struct cache_time ctime;
   struct cache_time mtime;
   unsigned int st_dev;
   unsigned int st_ino;
   unsigned int st_mode;
   unsigned int st_uid;
   unsigned int st_gid;
   unsigned int st_size;
   unsigned char sha1[20];
   unsigned short namelen;
   // 0长度字符数组，并不占用空间
   unsigned char name[0];
};

// 计算缓存项的长度
#define ce_size(ce) cache_entry_size((ce)->namelen)

/*
 offsetof(struct cache_entry,name)获取name在结构体中的偏移，即除去name之外的缓存项目大小
  & ~7 将最低3位置0，也就是说将最终的长度对8对齐
  +8  为了防止将最低3位置0后大小变小，因此提前+8来预留空间
  
 */
#define cache_entry_size(len) ((offsetof(struct cache_entry,name) + (len) + 8) & ~7)

2.2 文件加入缓存逻辑

获取文件meta信息，给文件建立索引，将文件加入缓存entry.


static int add_file_to_cache(char *path)
{
   int size, namelen;
   struct cache_entry *ce;
   struct stat st;
   int fd;

   fd = open(path, O_RDONLY);
   if (fd < 0) {
      if (errno == ENOENT)
         return remove_file_from_cache(path);
      return -1;
   }
   if (fstat(fd, &st) < 0) {
      close(fd);
      return -1;
   }
   namelen = strlen(path);
   size = cache_entry_size(namelen);
   ce = malloc(size);
   memset(ce, 0, size);
   memcpy(ce->name, path, namelen);
   ce->ctime.sec = st.st_ctime;
   ce->ctime.nsec = st.st_ctimespec.tv_nsec;
   ce->mtime.sec = st.st_mtime;
   ce->mtime.nsec = st.st_mtimespec.tv_nsec;
   ce->st_dev = st.st_dev;
   ce->st_ino = st.st_ino;
   ce->st_mode = st.st_mode;
   ce->st_uid = st.st_uid;
   ce->st_gid = st.st_gid;
   ce->st_size = st.st_size;
   ce->namelen = namelen;

   if (index_fd(path, namelen, ce, fd, &st) < 0)
      return -1;

   return add_cache_entry(ce);
}

文件建索引流程：将文件mmap到内存，使用zlib压缩meta信息(blob size + null byte)，压缩文件内容，计算sha1，根据sha1计算缓存文件名，写入缓存文件。


static int index_fd(const char *path, int namelen, struct cache_entry *ce, int fd, struct stat *st)
{
   z_stream stream;
   int max_out_bytes = namelen + st->st_size + 200;
   void *out = malloc(max_out_bytes);
   void *metadata = malloc(namelen + 200);
   void *in = mmap(NULL, st->st_size, PROT_READ, MAP_PRIVATE, fd, 0);
   SHA_CTX c;

   close(fd);
   if (!out || (int)(long)in == -1)
      return -1;

   memset(&stream, 0, sizeof(stream));
   deflateInit(&stream, Z_BEST_COMPRESSION);

   // 压缩meta信息
   /*
    * ASCII size + nul byte
    */    
   stream.next_in = metadata;
   stream.avail_in = 1+sprintf(metadata, "blob %lu", (unsigned long) st->st_size);
   stream.next_out = out;
   stream.avail_out = max_out_bytes;
   while (deflate(&stream, 0) == Z_OK)
      /* nothing */;

   /*
    * File content
    */
   // 压缩文件内容
   stream.next_in = in;
   stream.avail_in = st->st_size;
   while (deflate(&stream, Z_FINISH) == Z_OK)
      /*nothing */;

   deflateEnd(&stream);
   
   SHA1_Init(&c);
   SHA1_Update(&c, out, stream.total_out);
   // 计算sha1
   SHA1_Final(ce->sha1, &c);

   // 文件内容写入缓存
   return write_sha1_buffer(ce->sha1, out, stream.total_out);
}

int write_sha1_buffer(unsigned char *sha1, void *buf, unsigned int size)
{
   char *filename = sha1_file_name(sha1);
   int i, fd;

   fd = open(filename, O_WRONLY | O_CREAT | O_EXCL, 0666);
   if (fd < 0)
      return (errno == EEXIST) ? 0 : -1;
   write(fd, buf, size);
   close(fd);
   return 0;
}

根据哈希值计算文件名: 第一个哈希值决定目录，剩余的19个哈希值决定文件名

/*
 * NOTE! This returns a statically allocated buffer, so you have to be
 * careful about using it. Do a "strdup()" if you need to save the
 * filename.
 */
char *sha1_file_name(unsigned char *sha1)
{
   int i;
   static char *name, *base;

   if (!base) {
      char *sha1_file_directory = getenv(DB_ENVIRONMENT) ? : DEFAULT_DB_ENVIRONMENT;
      int len = strlen(sha1_file_directory);
      base = malloc(len + 60);
      memcpy(base, sha1_file_directory, len);
      memset(base+len, 0, 60);
      // .dircache/objects/xx/xx{19}
      base[len] = '/';
      base[len+3] = '/';
      name = base + len + 1;
   }
   for (i = 0; i < 20; i++) {
      static char hex[] = "0123456789abcdef";
      unsigned int val = sha1[i];
      //根据哈希值计算文件名。第一个哈希值决定目录，剩余的19个哈希值决定文件名
      // i > 0是用来跳过"/"， 第一个哈希值在"/"前，剩余的19个哈希值在"/"后
      char *pos = name + i*2 + (i > 0);
      *pos++ = hex[val >> 4];
      *pos = hex[val & 0xf];
   }
   return base;
}

add_cache_entry将文件加入缓存: 缓存按照文件路径排序，二分查找。


static int add_cache_entry(struct cache_entry *ce)
{
   int pos;

   pos = cache_name_pos(ce->name, ce->namelen);

   /* existing match? Just replace it */
   if (pos < 0) {
      active_cache[-pos-1] = ce;
      return 0;
   }

   /* Make sure the array is big enough .. */
   if (active_nr == active_alloc) {
      active_alloc = alloc_nr(active_alloc);
      active_cache = realloc(active_cache, active_alloc * sizeof(struct cache_entry *));
   }

   /* Add it in.. */
   active_nr++;
   // 要插入的位置不在最后，从pos开始元素向后移动
   if (active_nr > pos)
      memmove(active_cache + pos + 1, active_cache + pos, (active_nr - pos - 1) * sizeof(ce));
   active_cache[pos] = ce;
   return 0;
}

cache_name_pos根据名字获取缓存项的位置，二分查找。这个函数返回值比较特殊，没有找到返回最后一次查找first(>0)，找到了返回 -p -1(<0) 。这样设计，基于性能考虑，找到时返回了位置，在没有找到的时候返回了要插入的位置。

 static int cache_name_pos(const char *name, int namelen)
{
   int first, last;

   first = 0;
   last = active_nr;
   while (last > first) {
      int next = (last + first) >> 1;
      struct cache_entry *ce = active_cache[next];
      int cmp = cache_name_compare(name, namelen, ce->name, ce->namelen);
      if (!cmp)
         return -next-1;
      if (cmp < 0) {
         last = next;
         continue;
      }
      first = next+1;
   }
   return first;
}

cache_name_compare，先比较名称，再比较长度， 0相等，-1 小于，1 大于

static int cache_name_compare(const char *name1, int len1, const char *name2, int len2)
{
   int len = len1 < len2 ? len1 : len2;
   int cmp;

   cmp = memcmp(name1, name2, len);
   if (cmp)
      return cmp;
   if (len1 < len2)
      return -1;
   if (len1 > len2)
      return 1;
   return 0;
}

3、show-diff.c

核心逻辑：首先读取缓存，针对缓存中的每个entry，根据meta判断文件当前是否有变更，如果有打印文件路径以及sha1，并且根据sha1找到文件并解压文件内容，并调用系统的diff(diff -u - ${name})命令打印差异。对diff命令，-代表标准输入


int main(int argc, char **argv)
{
   int entries = read_cache();
   int i;

   if (entries < 0) {
      perror("read_cache");
      exit(1);
   }
   for (i = 0; i < entries; i++) {
      struct stat st;
      struct cache_entry *ce = active_cache[i];
      int n, changed;
      unsigned int mode;
      unsigned long size;
      char type[20];
      void *new;

      if (stat(ce->name, &st) < 0) {
         printf("%s: %s
", ce->name, strerror(errno));
         continue;
      }
      changed = match_stat(ce, &st);
      if (!changed) {
         printf("%s: ok
", ce->name);
         continue;
      }
      printf("%.*s:  ", ce->namelen, ce->name);
      for (n = 0; n < 20; n++)
         printf("%02x", ce->sha1[n]);
      printf("
");
      new = read_sha1_file(ce->sha1, type, &size);
      show_differences(ce, &st, new, size);
      free(new);
   }
   return 0;
}

show_differences 执行系统命令diff打印差异

static void show_differences(struct cache_entry *ce, struct stat *cur,
   void *old_contents, unsigned long long old_size)
{
   static char cmd[1000];
   FILE *f;

   snprintf(cmd, sizeof(cmd), "diff -u - %s", ce->name);
   f = popen(cmd, "w");
   fwrite(old_contents, old_size, 1, f);
   pclose(f);
}

4、cat-file.c

核心逻辑：按照sha1计算缓存文件名，读取文件解压将内容写入临时文件，并且打印类型以及长度

int main(int argc, char **argv)
{
   unsigned char sha1[20];
   char type[20];
   void *buf;
   unsigned long size;
   char template[] = "temp_git_file_XXXXXX";
   int fd;

   if (argc != 2 || get_sha1_hex(argv[1], sha1))
      usage("cat-file: cat-file ");
   buf = read_sha1_file(sha1, type, &size);
   if (!buf)
      exit(1);
   fd = mkstemp(template);
   if (fd < 0)
      usage("unable to create tempfile");
   if (write(fd, buf, size) != size)
      strcpy(type, "bad");
   printf("%s: %s
", template, type);
}

5、write-tree.c

核心逻辑：读取文件缓存数据，组成树内容。内容格式：tree size null mode name null [mode name null] mode name null。然后根据文件内容计算sha1，根据sha1计算文件路径，将压缩后的数据写入文件


int main(int argc, char **argv)
{
   unsigned long size, offset, val;
   int i, entries = read_cache();
   char *buffer;

   if (entries <= 0) {
      fprintf(stderr, "No file-cache to create a tree of
");
      exit(1);
   }

   /* Guess at an initial size */
   size = entries * 40 + 400;
   buffer = malloc(size);
   offset = ORIG_OFFSET;

   for (i = 0; i < entries; i++) {
      struct cache_entry *ce = active_cache[i];
      if (check_valid_sha1(ce->sha1) < 0)
         exit(1);
      // 空间不够重新申请
      if (offset + ce->namelen + 60 > size) {
         size = alloc_nr(offset + ce->namelen + 60);
         buffer = realloc(buffer, size);
      }
      // 格式：十进制权限 文件名 NULL sha1
      offset += sprintf(buffer + offset, "%o %s", ce->st_mode, ce->name);
      buffer[offset++] = 0;
      memcpy(buffer + offset, ce->sha1, 20);
      offset += 20;
   }
   /*
      offset - ORIG_OFFSET 数据长度
      ORIG_OFFSET 数据偏移
      将数据长度写到预留空间的尾部，向前填入"tree "，并调整buffer offset位置
      整体数据格式： tree size null mode name null sha1 [mode name null sha1] ... mode name null sha1
   */
   i = prepend_integer(buffer, offset - ORIG_OFFSET, ORIG_OFFSET);
   i -= 5;
   memcpy(buffer+i, "tree ", 5);

   buffer += i;
   offset -= i;

   write_sha1_file(buffer, offset);
   return 0;
}

prepend_integer 从i个位置向前以字符串形式填写val，并返回新的i

static int prepend_integer(char *buffer, unsigned val, int i)
{
   buffer[--i] = '';
   do {
      buffer[--i] = '0' + (val % 10);
      val /= 10;
   } while (val);
   return i;
}

数据样例：

x buf x/60b buf

https://wenku.baidu.com/view/62a4aea6e63a580216fc700abb68a98271feacb0.html?_wkts_=1676432746759&bdQuery=lldb+%E8%BF%9E%E7%BB%AD%E5%86%85%E5%AD%98

6、commit-tree.c

基础逻辑：校验参数后，获取当前登录用户的密码相关信息，用来获取用户名、email，记录changgelog。记录当前commit sha1，parent sha1 、author 、committer以及评论信息，调整缓存头"commit size." 根据文件内容sha1计算文件名，并保存到object目录。


int main(int argc, char **argv)
{
   int i, len;
   int parents = 0;
   unsigned char tree_sha1[20];
   unsigned char parent_sha1[MAXPARENT][20];
   char *gecos, *realgecos;
   char *email, realemail[1000];
   char *date, *realdate;
   char comment[1000];
   struct passwd *pw;
   time_t now;
   char *buffer;
   unsigned int size;

   if (argc < 2 || get_sha1_hex(argv[1], tree_sha1) < 0)
      usage("commit-tree  [-p ]* < changelog");

   for (i = 2; i < argc; i += 2) {
      char *a, *b;
      a = argv[i]; b = argv[i+1];
      if (!b || strcmp(a, "-p") || get_sha1_hex(b, parent_sha1[parents]))
         usage("commit-tree  [-p ]* < changelog");
      parents++;
   }
   if (!parents)
      fprintf(stderr, "Committing initial tree %s
", argv[1]);
   // 读取当前用户密码信息，用来记录changelog
   pw = getpwuid(getuid());
   if (!pw)
      usage("You don't exist. Go away!");
   realgecos = pw->pw_gecos;
   len = strlen(pw->pw_name);
   memcpy(realemail, pw->pw_name, len);
   realemail[len] = '@';
   gethostname(realemail+len+1, sizeof(realemail)-len-1);
   time(&now);
   realdate = ctime(&now);

   gecos = getenv("COMMITTER_NAME") ? : realgecos;
   email = getenv("COMMITTER_EMAIL") ? : realemail;
   date = getenv("COMMITTER_DATE") ? : realdate;

   remove_special(gecos); remove_special(realgecos);
   remove_special(email); remove_special(realemail);
   remove_special(date); remove_special(realdate);

   init_buffer(&buffer, &size);
   add_buffer(&buffer, &size, "tree %s
", sha1_to_hex(tree_sha1));

   /*
    * NOTE! This ordering means that the same exact tree merged with a
    * different order of parents will be a _different_ changeset even
    * if everything else stays the same.
    */
   for (i = 0; i < parents; i++)
      add_buffer(&buffer, &size, "parent %s
", sha1_to_hex(parent_sha1[i]));

   /* Person/date information */
   add_buffer(&buffer, &size, "author %s <%s> %s
", gecos, email, date);
   add_buffer(&buffer, &size, "committer %s <%s> %s

", realgecos, realemail, realdate);

   /* And add the comment */
   while (fgets(comment, sizeof(comment), stdin) != NULL)
      add_buffer(&buffer, &size, "%s", comment);

   finish_buffer("commit ", &buffer, &size);

   write_sha1_file(buffer, size);
   return 0;
}

缓存处理逻辑：初始化了16K基本缓存大小，预留了40字节头信息，每32k realloc一次内存。代码存在BUG，应该是笔误，16k 32k应该设置成一样大小，否则特殊场景会崩。


#define BLOCKING (1ul << 14)
#define ORIG_OFFSET (40)

/*
 * Leave space at the beginning to insert the tag
 * once we know how big things are.
 *
 * FIXME! Share the code with "write-tree.c"
 */
static void init_buffer(char **bufp, unsigned int *sizep)
{
   char *buf = malloc(BLOCKING);
   memset(buf, 0, ORIG_OFFSET);
   *sizep = ORIG_OFFSET;
   *bufp = buf;
}

static void add_buffer(char **bufp, unsigned int *sizep, const char *fmt, ...)
{
   char one_line[2048];
   va_list args;
   int len;
   unsigned long alloc, size, newsize;
   char *buf;

   va_start(args, fmt);
   len = vsnprintf(one_line, sizeof(one_line), fmt, args);
   va_end(args);
   size = *sizep;
   newsize = size + len;
   alloc = (size + 32767) & ~32767;
   buf = *bufp;
   if (newsize > alloc) {
      alloc = (newsize + 32767) & ~32767;   
      buf = realloc(buf, alloc);
      *bufp = buf;
   }
   *sizep = newsize;
   memcpy(buf + size, one_line, len);
}

五、总结

设计巧妙，代码简洁工整，注重性能，注释自由

1、基础模型

git里两个基本概念：The Object Database、Current Directory Cache

The Object Database：

对象数据库，对象内容采用zlib压缩，对象名采用sha1，包含三类对象，BLOB(普通文件内容)、TREE(文件权限/名称/sha1集合，表示一次提交的内容)、

CHANGESET(TREE父子链，表示变更历史)。

Current Directory Cache：

git暂存区，当前缓存的文件的META信息

2、功能维度

git第一版代码保持了linux工具链风格，每个工具只干一件事情，底层工具组合在一起完成代码管理功能

1）update-cache：git add雏形，保存最新文件内容到objects里，并更新本地目录缓存

2）show-diff：git status雏形，实现了缓存中的文件与最新状态差异对比

3）write-tree： git commit雏形1，保存工作区最新缓存树到objects目录并生成sha1

4）commit-tree: git commit雏形2，保存提交的树的sha1以及的parent树的sha1到object目录并生成sha1

理解了这些工具实现逻辑，不难想象目前git的各种命令和概念的原理。比如分支，分支本质只是一个changeset的sha1，基于sha1可以反向追溯每一次提交的tree。要实现两次提交diff，对比两个tree可以找到目录差异以及变化的文件，基于文件的sha1可以找到文件进而对比出文件的变化。分支拷贝，底层操作只需要拷贝一个sha1值，等等。

3、性能维度

实现功能同时充分考量性能，缓存项头格式设计、二分查找返回值的设计、文件内容头信息、文件访问采用mmap避免内核缓冲区到用户缓冲区数据拷贝

虽然对常规业务来讲，可读性高于性能，但随手可得的优化是程序员基本素养

展开阅读全文

页面更新：2024-03-27

标签：雏形缓存源码长度逻辑位置代码文件目录内容数据

1 2 3 4 5