Initial revision of "git", the information manager from hell

neerajsi-msft · Apr 7, 2005 · e83c516 · e83c516
commit e83c516
Show file tree

Hide file tree

Showing 11 changed files with 1,244 additions and 0 deletions.
diff --git a/Makefile b/Makefile
@@ -0,0 +1,40 @@
+CFLAGS=-g
+CC=gcc
+
+PROG=update-cache show-diff init-db write-tree read-tree commit-tree cat-file
+
+all: $(PROG)
+
+install: $(PROG)
+	install $(PROG) $(HOME)/bin/
+
+LIBS= -lssl
+
+init-db: init-db.o
+
+update-cache: update-cache.o read-cache.o
+	$(CC) $(CFLAGS) -o update-cache update-cache.o read-cache.o $(LIBS)
+
+show-diff: show-diff.o read-cache.o
+	$(CC) $(CFLAGS) -o show-diff show-diff.o read-cache.o $(LIBS)
+
+write-tree: write-tree.o read-cache.o
+	$(CC) $(CFLAGS) -o write-tree write-tree.o read-cache.o $(LIBS)
+
+read-tree: read-tree.o read-cache.o
+	$(CC) $(CFLAGS) -o read-tree read-tree.o read-cache.o $(LIBS)
+
+commit-tree: commit-tree.o read-cache.o
+	$(CC) $(CFLAGS) -o commit-tree commit-tree.o read-cache.o $(LIBS)
+
+cat-file: cat-file.o read-cache.o
+	$(CC) $(CFLAGS) -o cat-file cat-file.o read-cache.o $(LIBS)
+
+read-cache.o: cache.h
+show-diff.o: cache.h
+
+clean:
+	rm -f *.o $(PROG) temp_git_file_*
+
+backup: clean
+	cd .. ; tar czvf dircache.tar.gz dir-cache
diff --git a/README b/README
@@ -0,0 +1,168 @@
+
+	GIT - the stupid content tracker
+
+"git" can mean anything, depending on your mood.
+
+ - random three-letter combination that is pronounceable, and not
+   actually used by any common UNIX command.  The fact that it is a
+   mispronounciation of "get" may or may not be relevant.
+ - stupid. contemptible and despicable. simple. Take your pick from the
+   dictionary of slang.
+ - "global information tracker": you're in a good mood, and it actually
+   works for you. Angels sing, and a light suddenly fills the room. 
+ - "goddamn idiotic truckload of sh*t": when it breaks
+
+This is a stupid (but extremely fast) directory content manager.  It
+doesn't do a whole lot, but what it _does_ do is track directory
+contents efficiently. 
+
+There are two object abstractions: the "object database", and the
+"current directory cache".
+
+	The Object Database (SHA1_FILE_DIRECTORY)
+
+The object database is literally just a content-addressable collection
+of objects.  All objects are named by their content, which is
+approximated by the SHA1 hash of the object itself.  Objects may refer
+to other objects (by referencing their SHA1 hash), and so you can build
+up a hierarchy of objects. 
+
+There are several kinds of objects in the content-addressable collection
+database.  They are all in deflated with zlib, and start off with a tag
+of their type, and size information about the data.  The SHA1 hash is
+always the hash of the _compressed_ object, not the original one.
+
+In particular, the consistency of an object can always be tested
+independently of the contents or the type of the object: all objects can
+be validated by verifying that (a) their hashes match the content of the
+file and (b) the object successfully inflates to a stream of bytes that
+forms a sequence of <ascii tag without space> + <space> + <ascii decimal
+size> + <byte\0> + <binary object data>. 
+
+BLOB: A "blob" object is nothing but a binary blob of data, and doesn't
+refer to anything else.  There is no signature or any other verification
+of the data, so while the object is consistent (it _is_ indexed by its
+sha1 hash, so the data itself is certainly correct), it has absolutely
+no other attributes.  No name associations, no permissions.  It is
+purely a blob of data (ie normally "file contents"). 
+
+TREE: The next hierarchical object type is the "tree" object.  A tree
+object is a list of permission/name/blob data, sorted by name.  In other
+words the tree object is uniquely determined by the set contents, and so
+two separate but identical trees will always share the exact same
+object. 
+
+Again, a "tree" object is just a pure data abstraction: it has no
+history, no signatures, no verification of validity, except that the
+contents are again protected by the hash itself.  So you can trust the
+contents of a tree, the same way you can trust the contents of a blob,
+but you don't know where those contents _came_ from. 
+
+Side note on trees: since a "tree" object is a sorted list of
+"filename+content", you can create a diff between two trees without
+actually having to unpack two trees.  Just ignore all common parts, and
+your diff will look right.  In other words, you can effectively (and
+efficiently) tell the difference between any two random trees by O(n)
+where "n" is the size of the difference, rather than the size of the
+tree. 
+
+Side note 2 on trees: since the name of a "blob" depends entirely and
+exclusively on its contents (ie there are no names or permissions
+involved), you can see trivial renames or permission changes by noticing
+that the blob stayed the same.  However, renames with data changes need
+a smarter "diff" implementation. 
+
+CHANGESET: The "changeset" object is an object that introduces the
+notion of history into the picture.  In contrast to the other objects,
+it doesn't just describe the physical state of a tree, it describes how
+we got there, and why. 
+
+A "changeset" is defined by the tree-object that it results in, the
+parent changesets (zero, one or more) that led up to that point, and a
+comment on what happened. Again, a changeset is not trusted per se:
+the contents are well-defined and "safe" due to the cryptographically
+strong signatures at all levels, but there is no reason to believe that
+the tree is "good" or that the merge information makes sense. The
+parents do not have to actually have any relationship with the result,
+for example.
+
+Note on changesets: unlike real SCM's, changesets do not contain rename
+information or file mode chane information.  All of that is implicit in
+the trees involved (the result tree, and the result trees of the
+parents), and describing that makes no sense in this idiotic file
+manager.
+
+TRUST: The notion of "trust" is really outside the scope of "git", but
+it's worth noting a few things. First off, since everything is hashed
+with SHA1, you _can_ trust that an object is intact and has not been
+messed with by external sources. So the name of an object uniquely
+identifies a known state - just not a state that you may want to trust.
+
+Furthermore, since the SHA1 signature of a changeset refers to the
+SHA1 signatures of the tree it is associated with and the signatures
+of the parent, a single named changeset specifies uniquely a whole
+set of history, with full contents. You can't later fake any step of
+the way once you have the name of a changeset.
+
+So to introduce some real trust in the system, the only thing you need
+to do is to digitally sign just _one_ special note, which includes the
+name of a top-level changeset.  Your digital signature shows others that
+you trust that changeset, and the immutability of the history of
+changesets tells others that they can trust the whole history.
+
+In other words, you can easily validate a whole archive by just sending
+out a single email that tells the people the name (SHA1 hash) of the top
+changeset, and digitally sign that email using something like GPG/PGP.
+
+In particular, you can also have a separate archive of "trust points" or
+tags, which document your (and other peoples) trust.  You may, of
+course, archive these "certificates of trust" using "git" itself, but
+it's not something "git" does for you. 
+
+Another way of saying the same thing: "git" itself only handles content
+integrity, the trust has to come from outside. 
+
+	Current Directory Cache (".dircache/index")
+
+The "current directory cache" is a simple binary file, which contains an
+efficient representation of a virtual directory content at some random
+time.  It does so by a simple array that associates a set of names,
+dates, permissions and content (aka "blob") objects together.  The cache
+is always kept ordered by name, and names are unique at any point in
+time, but the cache has no long-term meaning, and can be partially
+updated at any time. 
+
+In particular, the "current directory cache" certainly does not need to
+be consistent with the current directory contents, but it has two very
+important attributes:
+
+ (a) it can re-generate the full state it caches (not just the directory
+     structure: through the "blob" object it can regenerate the data too)
+
+     As a special case, there is a clear and unambiguous one-way mapping
+     from a current directory cache to a "tree object", which can be
+     efficiently created from just the current directory cache without
+     actually looking at any other data.  So a directory cache at any
+     one time uniquely specifies one and only one "tree" object (but
+     has additional data to make it easy to match up that tree object
+     with what has happened in the directory)
+
+
+and
+
+ (b) it has efficient methods for finding inconsistencies between that
+     cached state ("tree object waiting to be instantiated") and the
+     current state. 
+
+Those are the two ONLY things that the directory cache does.  It's a
+cache, and the normal operation is to re-generate it completely from a
+known tree object, or update/compare it with a live tree that is being
+developed.  If you blow the directory cache away entirely, you haven't
+lost any information as long as you have the name of the tree that it
+described. 
+
+(But directory caches can also have real information in them: in
+particular, they can have the representation of an intermediate tree
+that has not yet been instantiated.  So they do have meaning and usage
+outside of caching - in one sense you can think of the current directory
+cache as being the "work in progress" towards a tree commit).
diff --git a/cache.h b/cache.h
@@ -0,0 +1,93 @@
+#ifndef CACHE_H
+#define CACHE_H
+
+#include <stdio.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <stddef.h>
+#include <stdlib.h>
+#include <stdarg.h>
+#include <errno.h>
+#include <sys/mman.h>
+
+#include <openssl/sha.h>
+#include <zlib.h>
+
+/*
+ * Basic data structures for the directory cache
+ *
+ * NOTE NOTE NOTE! This is all in the native CPU byte format. It's
+ * not even trying to be portable. It's trying to be efficient. It's
+ * just a cache, after all.
+ */
+
+#define CACHE_SIGNATURE 0x44495243	/* "DIRC" */
+struct cache_header {
+	unsigned int signature;
+	unsigned int version;
+	unsigned int entries;
+	unsigned char sha1[20];
+};
+
+/*
+ * The "cache_time" is just the low 32 bits of the
+ * time. It doesn't matter if it overflows - we only
+ * check it for equality in the 32 bits we save.
+ */
+struct cache_time {
+	unsigned int sec;
+	unsigned int nsec;
+};
+
+/*
+ * dev/ino/uid/gid/size are also just tracked to the low 32 bits
+ * Again - this is just a (very strong in practice) heuristic that
+ * the inode hasn't changed.
+ */
+struct cache_entry {
+	struct cache_time ctime;
+	struct cache_time mtime;
+	unsigned int st_dev;
+	unsigned int st_ino;
+	unsigned int st_mode;
+	unsigned int st_uid;
+	unsigned int st_gid;
+	unsigned int st_size;
+	unsigned char sha1[20];
+	unsigned short namelen;
+	unsigned char name[0];
+};
+
+const char *sha1_file_directory;
+struct cache_entry **active_cache;
+unsigned int active_nr, active_alloc;
+
+#define DB_ENVIRONMENT "SHA1_FILE_DIRECTORY"
+#define DEFAULT_DB_ENVIRONMENT ".dircache/objects"
+
+#define cache_entry_size(len) ((offsetof(struct cache_entry,name) + (len) + 8) & ~7)
+#define ce_size(ce) cache_entry_size((ce)->namelen)
+
+#define alloc_nr(x) (((x)+16)*3/2)
+
+/* Initialize the cache information */
+extern int read_cache(void);
+
+/* Return a statically allocated filename matching the sha1 signature */
+extern char *sha1_file_name(unsigned char *sha1);
+
+/* Write a memory buffer out to the sha file */
+extern int write_sha1_buffer(unsigned char *sha1, void *buf, unsigned int size);
+
+/* Read and unpack a sha1 file into memory, write memory to a sha1 file */
+extern void * read_sha1_file(unsigned char *sha1, char *type, unsigned long *size);
+extern int write_sha1_file(char *buf, unsigned len);
+
+/* Convert to/from hex/sha1 representation */
+extern int get_sha1_hex(char *hex, unsigned char *sha1);
+extern char *sha1_to_hex(unsigned char *sha1);	/* static buffer! */
+
+/* General helper functions */
+extern void usage(const char *err);
+
+#endif /* CACHE_H */
diff --git a/cat-file.c b/cat-file.c
@@ -0,0 +1,23 @@
+#include "cache.h"
+
+int main(int argc, char **argv)
+{
+	unsigned char sha1[20];
+	char type[20];
+	void *buf;
+	unsigned long size;
+	char template[] = "temp_git_file_XXXXXX";
+	int fd;
+
+	if (argc != 2 || get_sha1_hex(argv[1], sha1))
+		usage("cat-file: cat-file <sha1>");
+	buf = read_sha1_file(sha1, type, &size);
+	if (!buf)
+		exit(1);
+	fd = mkstemp(template);
+	if (fd < 0)
+		usage("unable to create tempfile");
+	if (write(fd, buf, size) != size)
+		strcpy(type, "bad");
+	printf("%s: %s\n", template, type);
+}