Skip to content

Commit

Permalink
flexible compression
Browse files Browse the repository at this point in the history
  • Loading branch information
ThomasWaldmann committed Apr 25, 2016
1 parent ddc7687 commit 61edea7
Show file tree
Hide file tree
Showing 3 changed files with 74 additions and 3 deletions.
8 changes: 6 additions & 2 deletions borg/archive.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,7 +17,7 @@
from . import xattr
from .compress import Compressor, COMPR_BUFFER
from .constants import * # NOQA
from .helpers import Chunk, Error, uid2user, user2uid, gid2group, group2gid, \
from .helpers import Chunk, create_compression_matcher, Error, uid2user, user2uid, gid2group, group2gid, \
parse_timestamp, to_localtime, format_time, format_timedelta, safe_encode, safe_decode, \
Manifest, Statistics, decode_dict, make_path_safe, StableDict, int_to_bigint, bigint_to_int, bin_to_hex, \
ProgressIndicatorPercent, ChunkIteratorFileWrapper, remove_surrogates, log_multi, \
Expand Down Expand Up @@ -148,6 +148,7 @@ def __init__(self, repository, key, manifest, name, cache=None, create=False,
if create:
self.items_buffer = CacheChunkBuffer(self.cache, self.key, self.stats)
self.chunker = Chunker(self.key.chunk_seed, *chunker_params)
self.compression_matcher = create_compression_matcher()
if name in manifest.archives:
raise self.AlreadyExists(name)
self.last_checkpoint = time.time()
Expand Down Expand Up @@ -592,11 +593,14 @@ def process_file(self, path, st, cache, ignore_inode=False):
}
# Only chunkify the file if needed
if chunks is None:
compress = self.compression_matcher.match(path)
fh = Archive._open_rb(path)
with os.fdopen(fh, 'rb') as fd:
chunks = []
for data in self.chunker.chunkify(fd, fh):
chunks.append(cache.add_chunk(self.key.id_hash(data), Chunk(data), self.stats))
chunks.append(cache.add_chunk(self.key.id_hash(data),
Chunk(data, compress=compress),
self.stats))
if self.show_progress:
self.stats.show_progress(item=item, dt=0.2)
cache.memorize_file(path_hash, st, [c.id for c in chunks])
Expand Down
59 changes: 59 additions & 0 deletions borg/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -1423,3 +1423,62 @@ def scandir_generic(path='.'):

def scandir_inorder(path='.'):
return sorted(scandir(path), key=lambda dirent: dirent.inode())


COMPRESSION_PATTERNS = """
none:*.gz
none:*.tgz
none:*.bz2
none:*.tbz2
none:*.xz
none:*.txz
none:*.lzma
none:*.lzo
none:*.zip
none:*.rar
none:*.7z
none:*.mp3
none:*.ogg
none:*.oga
none:*.flac
none:*.aac
none:*.mp4
none:*.mkv
none:*.m4v
none:*.m4a
none:*.avi
none:*.mpg
none:*.mpeg
none:*.webm
none:*.vob
none:*.ts
none:*.ogv
none:*.mov
none:*.flv
none:*.ogm
none:*.jpg
none:*.jpeg
none:*.png
none:*.gif
none:*.dmg
none:*.rpm
none:*.deb
none:*.msi
# later, auto could mean: use some automatic mechanism to determine whether a
# chunk is compressible, e.g. run lz4 on it and look at the compression level.
# if all above entries are removed, this could be used as only mechanism.
#auto:*
""".strip().split()


def create_compression_matcher(lines=COMPRESSION_PATTERNS, fallback='lz4'):
    """Build a PatternMatcher that maps file name patterns to compression specs.

    :param lines: iterable of '<compression-spec>:<shell-pattern>' entries;
                  blank lines and lines starting with '#' are ignored.
    :param fallback: compression spec returned by the matcher when no
                     pattern matches.
    :return: a PatternMatcher whose match() yields the compression spec string.
    """
    matcher = PatternMatcher(fallback=fallback)
    for line in lines:
        line = line.strip()
        # skip blank lines and comments
        if not line or line.startswith('#'):
            continue
        try:
            compression, pattern = line.split(':', 1)
        except ValueError:
            # malformed entry (no ':' separator) - ignore it
            continue
        matcher.add([parse_pattern(pattern)], compression)
    return matcher
10 changes: 9 additions & 1 deletion borg/testsuite/helpers.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@
prune_within, prune_split, get_cache_dir, get_keys_dir, Statistics, is_slow_msgpack, \
yes, TRUISH, FALSISH, DEFAULTISH, \
StableDict, int_to_bigint, bigint_to_int, bin_to_hex, parse_timestamp, CompressionSpec, ChunkerParams, Chunk, \
ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern, \
ProgressIndicatorPercent, ProgressIndicatorEndless, load_excludes, parse_pattern, create_compression_matcher, \
PatternMatcher, RegexPattern, PathPrefixPattern, FnmatchPattern, ShellPattern, partial_format, ChunkIteratorFileWrapper
from . import BaseTestCase, environment_variable, FakeInputs

Expand Down Expand Up @@ -915,3 +915,11 @@ def test_chunk_file_wrapper():
cfw = ChunkIteratorFileWrapper(iter([]))
assert cfw.read(2) == b''
assert cfw.exhausted


def test_compression_matcher():
    matcher = create_compression_matcher(fallback='fallback-compressor')
    # already-compressed file types from the default table map to 'none'
    for file_name in ('test.zip', 'test.jpg', 'test.mp3'):
        assert matcher.match(file_name) == 'none'
    # anything not covered by a pattern gets the fallback compression
    assert matcher.match('test') == 'fallback-compressor'

0 comments on commit 61edea7

Please sign in to comment.