-rwxr-xr-x  create-git.sh          | 26
-rwxr-xr-x  process_directory.sh   | 10
-rwxr-xr-x  rewrite-commit-dump.py | 54
3 files changed, 58 insertions(+), 32 deletions(-)
diff --git a/create-git.sh b/create-git.sh
index 6389024..667fed0 100755
--- a/create-git.sh
+++ b/create-git.sh
@@ -13,28 +13,24 @@ git config core.logAllRefUpdates false
git config prune.expire now
mkdir -p objects/info
targets=( $(find ../final/ -maxdepth 1 -mindepth 1 -printf '../final/%P/\n' | \
-  xargs -n1 readlink -f | tee >(sed -e 's:$:/git/objects:' > objects/info/alternates) ) )
-for x in "${targets[@]}"; do
-  rev=$(git --git-dir $x/git rev-list -1 master 2> /dev/null)
-  [ -z "$rev" ] && { echo "no content: $x"; continue; }
-  x="refs/heads/source/$(basename $x)"
-  git update-ref "$x" $rev
-done
-
-echo "linearizing history, and rewriting messages..."
+  xargs -n1 readlink -f | \
+  while read l; do
+    [ -e "$l/cvs2svn-tmp/git-dump.dat" ] || continue;
+    echo "$l/git/objects" >> objects/info/alternates
+    echo "$l"
+  done
+  )
+)
+echo "loading all commits, linearizing, and rewriting history..."
time (
-  git fast-export --progress=1000 --all --reverse --date-order --no-data | \
-  tee ../export-stream-raw | \
-  "${root}/rewrite-commit-dump.py" | \
+  "${root}/rewrite-commit-dump.py" "${targets[@]}" | \
  tee ../export-stream-rewritten | \
  git fast-import
) 2>&1 | tee git-creation.log
echo "recomposed; repacking and breaking alternate linkage..."
-# Wipe the strong refs to the other repos...
-git ls-remote . refs/heads/source/'*' | awk '{print $2;}' | xargs -n1 git update-ref -d
-# Localize the content...
+# Localize the content we actually use out of the alternates...
time git repack -Adf --window=100 --depth=100
# Wipe the alternates.
rm objects/info/alternates
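
Note on the mechanism above: git's alternates file, objects/info/alternates, holds one object-directory path per line; every repository listed there lends its objects to this repository without any copying, until the final `git repack -Adf` localizes whatever is actually reachable and the alternates file is removed. Below is a minimal Python sketch of what the rewritten shell loop builds; it assumes the same ../final layout as the script and is purely illustrative:

    import os

    final = os.path.realpath('../final')
    targets = []
    with open('objects/info/alternates', 'w') as alternates:
        for name in sorted(os.listdir(final)):
            repo = os.path.join(final, name)
            # Skip conversions that produced no commit dump, as the shell loop does.
            if not os.path.exists(os.path.join(repo, 'cvs2svn-tmp', 'git-dump.dat')):
                continue
            # One object directory per line; git reads these pools as if local.
            alternates.write(os.path.join(repo, 'git', 'objects') + '\n')
            targets.append(repo)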
diff --git a/process_directory.sh b/process_directory.sh
index 14ef28c..a7be6ed 100755
--- a/process_directory.sh
+++ b/process_directory.sh
@@ -16,10 +16,12 @@ f() {
  time cvs2git --options config -vv
  cd git
  git init --bare
-  { "${base}/rewrite-blob-data.py" ../cvs2svn-tmp/git-blob.dat;
-    cat ../cvs2svn-tmp/git-dump.dat;
-  } | git fast-import
-  rm -rf "${final}" git-work
+  # Note: we only pull in blob data here; this is intentional. We need to
+  # interlace the commit objects together, since these git object pools will
+  # be used as alternates for the final repo combination.
+  "${base}/rewrite-blob-data.py" ../cvs2svn-tmp/git-blob.dat | \
+    git fast-import --export-marks=../cvs2svn-tmp/git-blob.idx
+  rm -rf "${final}"
  cd "$root"
  mv "$output" "${final}"
  set +x
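
The new --export-marks flag tells git fast-import to dump its mark table on exit: one ":<mark> <sha1>" pair per line, mapping the :N labels used inside the blob stream to the object names it actually created. That table (git-blob.idx here) is what rewrite-commit-dump.py later loads to resolve blob references. A small sketch of the format, parsed the way deserialize_blob_map() does it; the SHA-1s are invented:

    # Two illustrative entries, in the format fast-import emits.
    sample = [
        ':1 e8b9ed651c6209820779382edee2537209aba4ae\n',
        ':2 0123456789abcdef0123456789abcdef01234567\n',
    ]

    def parse_marks(lines):
        # ":N <sha1>" per line -> {N: sha1}, mirroring deserialize_blob_map().
        pairs = (line.split() for line in lines if line.strip())
        return dict((int(mark.lstrip(':')), sha1) for mark, sha1 in pairs)

    blob_idx = parse_marks(sample)
    assert blob_idx[1] == 'e8b9ed651c6209820779382edee2537209aba4ae'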
diff --git a/rewrite-commit-dump.py b/rewrite-commit-dump.py
index 7678406..f657a8e 100755
--- a/rewrite-commit-dump.py
+++ b/rewrite-commit-dump.py
@@ -1,5 +1,7 @@
#!/usr/bin/python
import functools
+import operator
+import os
import re
import sys
from collections import namedtuple
@@ -12,10 +14,10 @@ mangler.append(functools.partial(
    re.compile(r"^\(portage version: (.*)\)$", re.M|re.I).sub,
    r"Package-Manager: portage-\1"))
-fields = ('mark', 'author', 'committer', 'msg', 'files')
+fields = ('author', 'committer', 'msg', 'files', 'timestamp')
record = namedtuple('record', fields)
-def deserialize_records(source):
+def deserialize_records(source, blob_idx):
    line = source.readline()
    while line:
        while line.split()[0] in ('reset', 'progress'):
@@ -28,9 +30,9 @@ def deserialize_records(source):
            line = source.readline()
            chunks = line.split(None, 1)
            assert len(chunks) == 2, line
-            if chunks[0] == 'from':
+            if chunks[0] in ('from', 'mark'):
                continue
-            assert chunks[0] in ('mark', 'author', 'committer', 'data')
+            assert chunks[0] in ('author', 'committer', 'data')
            if chunks[0] != 'data':
                d[chunks[0]] = chunks[1].strip()
                continue
@@ -63,28 +65,39 @@ def deserialize_records(source):
                files[mode[1]] = (mode[0], line)
            elif mode[0] == 'M':
                # M 100644 e8b9ed651c6209820779382edee2537209aba4ae dev-cpp/gtkmm/ChangeLog
-                chunks = mode[1].split(None, 3)
-                assert len(chunks) == 3, line
-                files[chunks[2]] = (mode[0], line)
+                # If it's not a sha1 but starts with ':', it's a fast-import mark index.
+                chunks = line.split(None, 4)
+                assert len(chunks) == 4, line
+                fname = chunks[3]
+                if chunks[2][0] == ':':
+                    line = ' '.join(chunks[:2] + [blob_idx[int(chunks[2][1:])], fname])
+                files[fname] = (mode[0], line)
            else:
                raise AssertionError("got unknown file op: mode=%r, line:\n%r" % (mode[0], line))
            line = source.readline()
        d['files'] = files
        # Basic sanity check for the code above...
        assert set(fields).issuperset(d), d
+        d.setdefault('author', d.get('committer'))
+        assert d['author'] is not None
+        # Skank the timestamp out...
+        chunks = d['author'].rsplit(None, 1)
+        assert len(chunks) == 2 and chunks[1] == '+0000', d['author']
+        d['timestamp'] = long(chunks[0].rsplit(None, 1)[1])
        yield record(*[d.get(x) for x in fields])
        # Bleh... of course namedtuple doesn't make this easy.
        line = source.readline()
-def serialize_records(records, handle, target='refs/heads/master', progress=1000):
+def serialize_records(records, handle, target='refs/heads/master', progress=5000):
    write = handle.write
    write('reset %s\n' % target)
    total = len(records)
    for idx, record in enumerate(records, 1):
        if idx % progress == 0:
            write('progress %02.1f%%: %i of %i commits\n'
-                % ((100 * float(idx))//total, idx, total))
+                % ((100 * float(idx))/total, idx, total))
        write('commit %s\n' % target)
+        write('mark :%i\n' % idx)
        # fields = ('author', 'committer', 'msg', 'files', 'timestamp')
        for name, value in zip(fields, record):
            if name == 'files':
@@ -94,17 +107,32 @@ def serialize_records(records, handle, target='refs/heads/master', progress=1000):
                write("%s %s\n" % (name, value))
            elif name == 'msg':
                write("data %i\n%s" % (len(value), value))
+            elif name == 'timestamp':
+                continue
            else:
                raise AssertionError("serialize is out of sync; don't know field %s" % name)
        write("\n")
+def deserialize_blob_map(source):
+    source = (x.strip().split() for x in source)
+    return dict((int(x[0].lstrip(':')), x[1]) for x in source)
+
def main(argv):
-    source = open(argv[0], 'r') if argv else sys.stdin
-    records = list(deserialize_records(source))
+    records = []
+    source = argv if argv else sys.stdin
+    directories = [x.strip() for x in source]
+    for directory in directories:
+        tmp = os.path.join(directory, 'cvs2svn-tmp')
+        commits = os.path.join(tmp, 'git-dump.dat')
+        if not os.path.exists(commits):
+            sys.stderr.write("skipping %s; no commit data\n" % directory)
+            continue
+        blob_index = deserialize_blob_map(open(os.path.join(tmp, 'git-blob.idx')))
+        records.extend(deserialize_records(open(commits, 'r'), blob_index))
+    records.sort(key=operator.attrgetter('timestamp'))
    serialize_records(records, sys.stdout)
    return 0
if __name__ == '__main__':
-    if len(sys.argv) not in (1, 2):
-        raise SystemExit("args must be either none, or path to fast-export stream to read", code=1)
    sys.exit(main(sys.argv[1:]))
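
Taken together, main() now reads every per-package git-dump.dat, rewrites each M (filemodify) line so that import-local :mark blob references become the stable SHA-1s recorded in that package's git-blob.idx, sorts the combined records by committer timestamp to linearize the history, and re-serializes a single stream for git fast-import. A hedged sketch of just the mark resolution; the mark number and SHA-1 are invented, and the real code splits the line slightly differently so it can track the filename separately:

    blob_idx = {123: 'e8b9ed651c6209820779382edee2537209aba4ae'}

    line = 'M 100644 :123 dev-cpp/gtkmm/ChangeLog\n'
    op, mode, ref, fname = line.split(None, 3)  # fname keeps the trailing newline
    if ref.startswith(':'):
        # Swap the import-local mark for the real object name; the mode and
        # path fields are left untouched.
        line = ' '.join([op, mode, blob_idx[int(ref[1:])], fname])
    assert line == 'M 100644 e8b9ed651c6209820779382edee2537209aba4ae dev-cpp/gtkmm/ChangeLog\n'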