Download¶

Dataset download utilities.

`download(url, target_directory=None, stream=True, skip_if_file_exists=False)` ¶

download(url: str, target_directory: Path | None = None, stream: Literal[True] = True, skip_if_file_exists: bool = False) -> AbstractContextManager[Iterable[bytes]]

download(url: str, target_directory: Path, stream: Literal[False], skip_if_file_exists: bool = False) -> Path

download(url: str, target_directory: Path | None = None, stream: bool = True, skip_if_file_exists: bool = False) -> Path | AbstractContextManager[Iterable[bytes]]

Download a file from the given URL.

Parameters:

Name	Type	Description	Default
`url`	`str`	The URL of the file to download.	required
`target_directory`	`Path \| None`	Optional directory where the file should be saved. Ignored when downloading in streaming mode. If not provided and streaming is disabled, a `ValueError` will be raised.	`None`
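`stream`	`bool`	Streams the file contents instead of saving them to the target directory.	`True`
`skip_if_file_exists`	`bool`	Skips the download and returns the existing path if the target file already exists. Only applies when streaming is disabled.	`False`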

Returns:

Type	Description
`Path \| AbstractContextManager[Iterable[bytes]]`	A context manager yielding the byte stream when `stream` is enabled; otherwise, the path of the downloaded file inside `target_directory`.

Source code in meld/download.py

def download(
    url: str, target_directory: Path | None = None, stream: bool = True, skip_if_file_exists: bool = False
) -> Path | AbstractContextManager[Iterable[bytes]]:
    """
    Download a file from the given URL.

    Args:
        url: The URL of the file to download.
        target_directory: Optional directory where the file should be
            saved. Ignored when downloading in streaming mode. If not
            provided and streaming is disabled, a `ValueError` will be
            raised.
        stream: Streams the file contents instead of saving it to the
            target directory

    Returns:
        The path of the downloaded file if `target_directory` is
        specified; otherwise, a streaming response.
    """

    # Always enable stream here even when "stream" if False for efficiently saving directly to disk
    response = requests.get(url, stream=True)
    # Enables transfer decompression to address https://github.com/psf/requests/issues/2155
    response.raw.decode_content = True
    file_size = int(response.headers.get("content-length", 0))
    content_disposition = response.headers.get("Content-Disposition")
    response.raise_for_status()

    if content_disposition is None or (filename := _parse_content_disposition_filename(content_disposition)) is None:
        # Parse filename from the URL as a fallback
        filename = Path(urlparse(url).path).name

    with_progress = tqdm.wrapattr(response.raw, "read", total=file_size, desc=filename)
    # Directly return the raw streaming response in streaming mode
    if target_directory is None or stream:
        if not stream:
            raise ValueError("Target directory must be given when not downloading in streaming mode")

        return with_progress  # pyright: ignore

    target_path = target_directory / filename
    # Skip downloading if skipping is enabled and the target file already exists
    if skip_if_file_exists and target_path.is_file():
        return target_path

    # Copy the byte stream directly to a file
    with with_progress as data_stream, target_path.open("wb") as file:
        shutil.copyfileobj(data_stream, file)

    return target_path

`extract(path_or_stream, target_directory, members=None, archive_extension=None, member_globs=False)` ¶

Extract files from an archive.

Parameters:

Name	Type	Description	Default
`path_or_stream`	`Iterable[bytes] \| Path`	The file path or bytestream of the archive.	required
`target_directory`	`Path`	The directory where the files should be extracted.	required
`members`	`Iterable[str \| Path] \| None`	Specific members to extract. If None, all members are extracted.	`None`
`archive_extension`	`str \| None`	The extension of the archive (optional if path_or_stream is a Path).	`None`
`member_globs`	`bool`	Whether to interpret the members parameter as globs.	`False`

Returns:

Type	Description
`list[Path]`	A list of the extracted file paths.

Source code in meld/download.py

def extract(
    path_or_stream: Iterable[bytes] | Path,
    target_directory: Path,
    members: Iterable[str | Path] | None = None,
    archive_extension: str | None = None,
    member_globs: bool = False,
) -> list[Path]:
    """
    Extract files from an archive.

    Args:
        path_or_stream: The file path or bytestream of the archive.
        target_directory: The directory where the files should be
            extracted.
        members: Specific members to extract. If None, all members are
            extracted.
        archive_extension: The extension of the archive (optional if
            path_or_stream is a Path).
        member_globs: Whether to interpret the members parameter as
            globs.

    Returns:
        A list of the extracted file paths.
    """

    if isinstance(path_or_stream, Path):
        if archive_extension is None:
            archive_extension = "".join(path_or_stream.suffixes)
    elif archive_extension is None:
        raise ValueError("Archive type needs to be specified when decoding a bytestream")

    extractor = _get_extractor(archive_extension)
    return extractor(path_or_stream, target_directory, members, member_globs)

`extract_tar(path_or_stream, target_directory, members=None, member_globs=False)` ¶

Extract a tar archive.

Parameters:

Name	Type	Description	Default
`path_or_stream`	`Iterable[bytes] \| Path`	The file path or bytestream of the tar file.	required
`target_directory`	`Path`	The directory where the files should be extracted.	required
`members`	`Iterable[str \| Path] \| None`	Specific members to extract. If None, all members are extracted.	`None`
`member_globs`	`bool`	Whether to interpret the members parameter as globs.	`False`

Returns:

Type	Description
`list[Path]`	A list of the extracted file paths

Source code in meld/download.py

def extract_tar(
    path_or_stream: Iterable[bytes] | Path,
    target_directory: Path,
    members: Iterable[str | Path] | None = None,
    member_globs: bool = False,
) -> list[Path]:
    """
    Extract a tar archive.

    Args:
        path_or_stream: The file path or bytestream of the tar file.
        target_directory: The directory where the files should be
            extracted.
        members: Specific members to extract. If None, all members are
            extracted.
        member_globs: Whether to interpret the members parameter as
            globs.

    Returns:
        A list of the extracted file paths
    """

    file_argument: dict[Literal["name", "fileobj"], Any]
    if isinstance(path_or_stream, Path):
        file_argument = {"name": path_or_stream}
    else:
        file_argument = {"fileobj": path_or_stream}

    if members is None:
        # Safest default
        filter = "data"
    else:
        if member_globs:
            globs = list(map(str, members))

            # Checks for each path whether they match one of the specified globs
            def tar_filter(tar_info: tarfile.TarInfo, _: str) -> tarfile.TarInfo | None:
                if any(Path(tar_info.name).match(glob) for glob in globs):
                    return tar_info
        else:
            paths = set(map(str, members))

            # Matches paths literally
            def tar_filter(tar_info: tarfile.TarInfo, _: str) -> tarfile.TarInfo | None:
                if tar_info.name in paths:
                    return tar_info

        filter = tar_filter

    with tarfile.open(**file_argument, mode="r") as tar:
        tar.extractall(
            target_directory,
            filter=filter,
        )

    if members is None:
        return [
            path_or_stream
            for directory, _, files in target_directory.walk()
            for file in files
            if (path_or_stream := directory / file).is_file()
        ]

    return list(map(Path, members))

`extract_zip(path, target_directory, members=None, member_globs=False)` ¶

Extract a zip archive.

Parameters:

Name	Type	Description	Default
`path`	`Iterable[bytes] \| Path`	The file path of the zip file. Bytestreams are not supported and raise a `NotImplementedError`.	required
`target_directory`	`Path`	The directory where the files should be extracted.	required
`members`	`Iterable[str \| Path] \| None`	Specific members to extract. If None, all members are extracted.	`None`
`member_globs`	`bool`	Whether to interpret the members parameter as globs.	`False`

Returns:

Type	Description
`list[Path]`	A list of the extracted file paths.

Source code in meld/download.py

def extract_zip(
    path: Iterable[bytes] | Path,
    target_directory: Path,
    members: Iterable[str | Path] | None = None,
    member_globs: bool = False,
) -> list[Path]:
    """
    Extract a zip archive.

    Args:
        path: The file path or bytestream of the zip file.
        target_directory: The directory where the files should be
            extracted.
        members: Specific members to extract. If None, all members are
            extracted.
        member_globs: Whether to interpret the members parameter as
            globs.

    Returns:
        A list of the extracted file paths.
    """

    if not isinstance(path, Path):
        raise NotImplementedError("Streaming decompression not supported for zip files")

    with ZipFile(path) as file:
        if member_globs and members is not None:
            globs = list(map(str, members))
            members = [path for path in map(Path, file.namelist()) if any(path.match(glob) for glob in globs)]

        file.extractall(target_directory, None if members is None else map(str, members))
        if members is None:
            return list(map(Path, file.namelist()))
        return list(map(Path, members))

`git_download(repo_url, revision, target_directory, files, keep_repo=False)` ¶

Clone a specific revision of a Git repository and extract specified files to a target directory.

Parameters:

Name	Type	Description	Default
`repo_url`	`str`	URL of the Git repository to clone.	required
`revision`	`str`	Revision of the repository to download.	required
`target_directory`	`Path`	Directory where the extracted files from the repository will be stored.	required
`files`	`Sequence[PathLike[str] \| str]`	List of file paths to extract, relative to the root of the Git repository.	required
`keep_repo`	`bool`	Keeps the cloned repository in the target_directory instead of a temporary directory for future re-use	`False`

Source code in meld/download.py

def git_download(
    repo_url: str, revision: str, target_directory: Path, files: Sequence[PathLike[str] | str], keep_repo: bool = False
) -> None:
    """
    Clone a specific revision of a Git repository and extract specified files to a target directory.

    Args:
        repo_url: URL of the Git repository to clone.
        revision: Revision of the repository to download. It is compared
            against commit hashes, so it is expected to be a full commit
            SHA.
        target_directory: Directory where the extracted files from the
            repository will be stored.
        files: List of file paths to extract, relative to the root of
            the Git repository.
        keep_repo: Keeps the cloned repository in the target_directory
            instead of a temporary directory for future re-use

    Raises:
        AssertionError: If the commit fetched from the remote does not
            match the requested revision.
    """

    with contextlib.nullcontext(target_directory) if keep_repo else TemporaryDirectory() as tmpdirname:
        repo_path = Path(tmpdirname) / "temp_repo.git"
        # Only clone the repository if has not already been cloned in keep_repo mode previously
        # NOTE(review): a re-used clone is not re-fetched, so this presumably relies on the
        # requested revision already being present in it - confirm
        if keep_repo and repo_path.exists():
            repo = Repo(repo_path)
        else:
            repo = Repo.clone_from(repo_url, repo_path, _GitProgress(), multi_options=["--bare", "--depth 1"])
            # Shallow fetch and checkout the requested revision if necessary
            if repo.head.commit.hexsha != revision:
                result = repo.remote().fetch(revision, progress=_GitProgress(), depth=1)
                fetched_revision = result[0].commit.hexsha
                # Raised explicitly (same exception type as the former assert statement)
                # so that the check is not stripped when running with "python -O"
                if fetched_revision != revision:
                    raise AssertionError(f"Fetched revision {fetched_revision} does not match requested {revision}")

        stream = BytesIO()
        repo.archive(stream, revision, path=files)
        # Seek back to start to read the in-memory buffer
        stream.seek(0)

        with tarfile.open(fileobj=stream) as tar:
            # Use the same safe extraction filter as extract_tar; calling
            # extractall without a filter is deprecated since Python 3.12
            tar.extractall(target_directory, filter="data")

Download¶

download(url, target_directory=None, stream=True, skip_if_file_exists=False) ¶

extract(path_or_stream, target_directory, members=None, archive_extension=None, member_globs=False) ¶

extract_tar(path_or_stream, target_directory, members=None, member_globs=False) ¶

extract_zip(path, target_directory, members=None, member_globs=False) ¶

git_download(repo_url, revision, target_directory, files, keep_repo=False) ¶

`download(url, target_directory=None, stream=True, skip_if_file_exists=False)` ¶

`extract(path_or_stream, target_directory, members=None, archive_extension=None, member_globs=False)` ¶

`extract_tar(path_or_stream, target_directory, members=None, member_globs=False)` ¶

`extract_zip(path, target_directory, members=None, member_globs=False)` ¶

`git_download(repo_url, revision, target_directory, files, keep_repo=False)` ¶