From 1a402504863d7138fbb3c27c164d5737d7f98fc8 Mon Sep 17 00:00:00 2001 From: Stefan Karpinski Date: Wed, 21 Apr 2021 22:58:44 -0400 Subject: [PATCH] Tar.create: include entries for non-empty directories Issue #103 points out that omitting directory entries for non-empty directories can confuse some tools that consume tarballs, including docker, which applies overly restrictive permissions to directories which are not explicitly included in a tarball. This commit changes `Tar.create` and `Tar.rewrite` to produce tarballs which include explicit directory entries for all (non-root) directories. This changes Tar.jl's "canonical format", which is, by design, one-to-one with git tree hashes. However, it does not seem like anyone currently depends on that exact reproducibility, and it seems worth making this change in order to avoid confusing external consumers of our tarballs. Closes #103. --- README.md | 17 +++++++++-------- src/create.jl | 18 ++++++++++-------- test/runtests.jl | 1 + test/setup.jl | 2 +- 4 files changed, 21 insertions(+), 17 deletions(-) diff --git a/README.md b/README.md index d7dd413..9a6ac8e 100644 --- a/README.md +++ b/README.md @@ -293,14 +293,6 @@ and it still records user and group information, modification times and exact permissions. Coaxing command-line `tar` programs to omit this non-portable information and use a portable (and `git`-compatible sort order) is non-trivial. -Another difference from command-line `tar`: non-empty directories are also -omitted from the tarballs that `Tar` creates since no metadata is recorded about -directories aside from the fact that they exist and the existence of non-empty -directories is already implied by the fact that they contain something else. If, -in the future, the ability to record metadata about directories is added, -tarballs will record entries for non-empty directories with non-default -metadata. 
- On the extraction side of things, doing `Tar.extract(tarball, dir)` is roughly equivalent to the following commands: ```sh @@ -472,6 +464,15 @@ have the same `git` tree hash, but produce different tarballs. Two _identical_ file trees will always produce identical tarballs, however, and that tarball should remain stable in future versions of the `Tar` package. +**Note:** the canonical tarball format was [changed] slightly in the 1.10 +release of the package. Since that release, the canonical format _includes_ all +directories, whereas previously non-empty +directories were omitted since their existence is implied by their contents. The +absence of explicit directory entries in tarballs confused some external +tooling, so it was deemed worth a small format change to avoid such problems. + +[changed]: https://github.com/JuliaIO/Tar.jl/pull/106 + The `tree_hash` function can be used to compute a git-style tree hash of the contents of a tarball (without needing to extract it). Moreover, two tarballs created by the `Tar` package will have the same hash if and only if they contain diff --git a/src/create.jl b/src/create.jl index 2663a5c..935c7fb 100644 --- a/src/create.jl +++ b/src/create.jl @@ -80,24 +80,26 @@ function write_tarball( tar_path::String = "."; buf::Vector{UInt8} = Vector{UInt8}(undef, DEFAULT_BUFFER_SIZE), ) - w = 0 hdr, data = callback(sys_path, tar_path) if hdr.type == :directory data isa Union{Nothing, AbstractDict{<:AbstractString}} || error("callback must return a dict of strings, got: $(repr(data))") - data !== nothing && for name in sort!(collect(keys(data))) - sys_path′ = data[name] - tar_path′ = tar_path == "." ? 
name : "$tar_path/$name" - w += write_tarball(callback, tar, sys_path′, tar_path′, buf=buf) - end else data isa Union{Nothing, AbstractString, IO, Tuple{IO,Integer}} || error("callback must return nothing, string or IO, got: $(repr(data))") end - if hdr.type != :directory || w == 0 + w = 0 + if tar_path != "." + w += write_tarball(tar, hdr, data, buf=buf) + end + data isa AbstractDict && for name in sort!(collect(keys(data))) + sys_path′ = data[name] + tar_path′ = tar_path == "." ? name : "$tar_path/$name" + w += write_tarball(callback, tar, sys_path′, tar_path′, buf=buf) + end + if tar_path == "." && w == 0 w += write_tarball(tar, hdr, data, buf=buf) end - @assert w > 0 return w end diff --git a/test/runtests.jl b/test/runtests.jl index 2a5bf15..5c0a47a 100644 --- a/test/runtests.jl +++ b/test/runtests.jl @@ -212,6 +212,7 @@ end # check rewrite tarball′ = Tar.rewrite(tarball) @test Tar.list(tarball′) == [ + Tar.Header("dir", :directory, 0o755, 0, "") Tar.Header("dir/file", :file, 0o755, 0, "") Tar.Header("file", :file, 0o755, 0, "") ] diff --git a/test/setup.jl b/test/setup.jl index 5d2f4e9..b1a7e23 100644 --- a/test/setup.jl +++ b/test/setup.jl @@ -135,7 +135,7 @@ function make_test_dir(gen_skip::Bool=false) return dir end -const test_dir_paths = ["dir/file", "empty", "file", "link"] +const test_dir_paths = ["dir", "dir/file", "empty", "file", "link"] Sys.iswindows() && pop!(test_dir_paths) # uses Tar.list(callback, tarball) API