File size: 8,445 Bytes
c10fb73
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
# Copyright 2026-present, the HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Shared ``cp`` command to copy files between local paths, repositories and buckets.

This single command backs three identical CLI entry points: ``hf cp`` (top-level),
``hf repos cp`` and ``hf buckets cp``. It supports any source/destination combination
of local file, repo/bucket ``hf://`` URI, and ``-`` (stdin/stdout), with two exceptions:
- bucket-to-repo copies, which are not supported (server limitation), and
- copies where neither side is an ``hf://`` URI, e.g. local-to-local (use a regular ``cp`` for that).
"""

import os
import sys
from dataclasses import replace
from typing import Annotated

import typer

from huggingface_hub import HfApi
from huggingface_hub.utils import HfUri, SoftTemporaryDirectory, disable_progress_bars, is_hf_uri, parse_hf_uri

from ._cli_utils import TokenOpt, get_hf_api
from ._output import out


# Example invocations of the ``cp`` command, grouped by direction (download, upload,
# remote-to-remote). NOTE(review): presumably rendered in the CLI help/epilog by the
# code that registers the command -- the consumer is not visible in this file.
CP_EXAMPLES = [
    # Download (repo or bucket -> local / stdout)
    "hf cp hf://username/my-model/config.json",
    "hf cp hf://username/my-model/config.json ./config.json",
    "hf cp hf://datasets/username/my-dataset/data.csv ./data/",
    "hf cp hf://buckets/username/my-bucket/config.json -",
    # Upload (local / stdin -> repo or bucket)
    "hf cp ./model.safetensors hf://username/my-model/model.safetensors",
    "hf cp ./config.json hf://buckets/username/my-bucket/logs/",
    "hf cp - hf://buckets/username/my-bucket/config.json",
    # Remote to remote (repo/bucket -> repo/bucket, server-side when possible)
    "hf cp hf://username/source-model/ hf://username/dest-model/",
    "hf cp hf://datasets/username/my-dataset/processed/ hf://buckets/username/my-bucket/processed/",
    "hf cp hf://buckets/username/my-bucket/logs/ hf://buckets/username/archive-bucket/  # copies contents only",
]


def cp(
    src: Annotated[
        str,
        typer.Argument(help="Source: local file, hf:// URI (repo or bucket), or - for stdin."),
    ],
    dst: Annotated[
        str | None,
        typer.Argument(help="Destination: local path, hf:// URI (repo or bucket), or - for stdout."),
    ] = None,
    token: TokenOpt = None,
) -> None:
    """Copy files between local paths, repositories, and buckets.

    Handles uploads (local/stdin -> repo/bucket), downloads (repo/bucket -> local/stdout) and
    remote-to-remote copies (repo/bucket -> repo/bucket). Bucket-to-repo and local-to-local
    copies are not supported. For directories, use `hf upload`/`hf download` (repos) or
    `hf buckets sync` (buckets).
    """
    # NOTE(review): the docstring above is kept verbatim -- Typer may surface it as the
    # command's help text, which would make it user-facing.
    api = get_hf_api(token=token)

    remote_src = is_hf_uri(src)
    remote_dst = False if dst is None else is_hf_uri(dst)

    if remote_src:
        if remote_dst:
            # Remote -> remote (repo/bucket -> repo/bucket): handled entirely by copy_files.
            assert dst is not None  # implied by remote_dst being truthy
            api.copy_files(src, dst)
            out.result("Successfully copied", src=src, dst=dst)
        elif dst == "-":
            # Remote -> stdout.
            _download_file_to_stdout(api, src)
        else:
            # Remote -> local path (dst may be None: the helper then uses the source file name).
            _download_file_to_local(api, src, dst)
        return

    if not remote_dst:
        # Neither side is an hf:// URI: local->local, stdin->local, a missing DST, etc.
        if dst is None:
            message = "Missing destination. Provide a repo or bucket hf:// URI as DST."
        else:
            message = "One of SRC or DST must be a repo (hf://username/...) or bucket (hf://buckets/...) URI."
        raise typer.BadParameter(message)

    # Only the upload case is left: local file or stdin -> repo/bucket.
    assert dst is not None  # implied by remote_dst being truthy
    _upload_file_to_remote(api, src, dst, src_is_stdin=(src == "-"))


def _download_file_to_stdout(api: HfApi, src: str) -> None:
    """Write the content of a single remote file (repo or bucket) to stdout.

    The file is first downloaded into a temporary directory and then copied to
    ``sys.stdout.buffer`` chunk by chunk. ``_source_filename`` rejects sources
    that do not point to a file before anything is downloaded.
    """
    parsed = parse_hf_uri(src)
    name = _source_filename(parsed, src)
    chunk_size = 32_000_000  # 32MB per read

    # Progress bars are disabled so they do not pollute the piped output.
    with disable_progress_bars(), SoftTemporaryDirectory() as scratch_dir:
        scratch_file = os.path.join(scratch_dir, name)
        _download_single(api, parsed, scratch_file)
        with open(scratch_file, "rb") as handle:
            # read() returns b"" at end of file, which terminates the iterator.
            for piece in iter(lambda: handle.read(chunk_size), b""):
                sys.stdout.buffer.write(piece)


def _download_file_to_local(api: HfApi, src: str, dst: str | None) -> None:
    """Download a single remote file (repo or bucket) to a local path.

    Destination resolution:
    - ``dst`` is ``None``: the file is saved under its remote file name, relative to the
      current working directory.
    - ``dst`` is an existing directory or ends with a path separator: the file is saved
      inside it under its remote file name.
    - otherwise ``dst`` is used as the full target file path.

    Missing parent directories of the target are created before downloading.
    """
    parsed = parse_hf_uri(src)
    name = _source_filename(parsed, src)

    if dst is None:
        target = name
    else:
        # Accept "/" in addition to os.sep so forward slashes mark a directory on every platform.
        into_directory = os.path.isdir(dst) or dst.endswith((os.sep, "/"))
        target = os.path.join(dst, name) if into_directory else dst

    # dirname is empty for a bare file name: nothing to create in that case.
    if folder := os.path.dirname(target):
        os.makedirs(folder, exist_ok=True)

    _download_single(api, parsed, target)
    out.result("Successfully downloaded", src=src, dst=target)


def _download_single(api: HfApi, uri: HfUri, local_path: str) -> None:
    """Fetch one file, from a repo or a bucket, and store it at ``local_path``.

    Shared by `_download_file_to_local` and `_download_file_to_stdout`.
    """
    if uri.is_bucket:
        api.download_bucket_files(uri.id, [(uri.path_in_repo, local_path)])
        return

    # Repo file: stage the download in a temporary folder created next to the destination
    # (instead of the shared cache) so that the final move happens within one filesystem.
    # The staging folder is removed when the `with` block exits.
    staging_root = os.path.dirname(local_path) or "."
    with SoftTemporaryDirectory(prefix=".tmp", dir=staging_root) as staging_dir:
        fetched_path = api.hf_hub_download(
            repo_id=uri.id,
            repo_type=uri.type,
            filename=uri.path_in_repo,
            revision=uri.revision,
            local_dir=staging_dir,
        )
        os.replace(fetched_path, local_path)


def _source_filename(uri: HfUri, src: str) -> str:
    """Return the last path component of the remote source file.

    Raises:
        typer.BadParameter: if the URI has no path inside the repo/bucket, or if ``src``
            ends with ``/`` (i.e. it designates a directory rather than a file).
    """
    remote_path = uri.path_in_repo
    if remote_path != "" and not src.endswith("/"):
        # Everything after the last "/" (or the whole path when there is none).
        return remote_path.rpartition("/")[2]

    raise typer.BadParameter(
        "Source path must include a file name, not just a repo/bucket or directory path."
        " Use `hf download` or `hf buckets sync` to copy directories."
    )


def _upload_file_to_remote(api: HfApi, src: str, dst: str, *, src_is_stdin: bool) -> None:
    """Upload a local file, or the bytes read from stdin, to a repo or bucket.

    For a local file the remote path is derived from ``dst``:
    - no path in the URI: the local file name is used at the root,
    - ``dst`` ends with ``/``: the local file name is appended to the given path,
    - otherwise the path in the URI is used as is.

    Raises:
        typer.BadParameter: if a stdin upload has no explicit destination file name, if
            ``src`` is a directory, or if ``src`` is not an existing file.
    """
    target = parse_hf_uri(dst)
    given_path = target.path_in_repo
    dst_is_folder = dst.endswith("/")

    if not src_is_stdin:
        if os.path.isdir(src):
            raise typer.BadParameter(
                "Source must be a file, not a directory. Use `hf upload` or `hf buckets sync` for directories."
            )
        if not os.path.isfile(src):
            raise typer.BadParameter(f"Source file not found: {src}")

        local_name = os.path.basename(src)
        if given_path == "":
            remote_path = local_name
        elif dst_is_folder:
            remote_path = f"{given_path}/{local_name}"
        else:
            remote_path = given_path

        _upload_single(api, target, src, remote_path)
        out.result("Successfully uploaded", src=src, dst=replace(target, path_in_repo=remote_path).to_uri())
        return

    # Stdin has no file name to fall back on, so the destination must spell it out.
    if given_path == "" or dst_is_folder:
        raise typer.BadParameter("Stdin upload requires a full destination path including filename.")
    payload = sys.stdin.buffer.read()  # the whole input is read into memory before uploading
    _upload_single(api, target, payload, given_path)
    out.result("Successfully uploaded", src="stdin", dst=target.to_uri())


def _upload_single(api: HfApi, uri: HfUri, source: str | bytes, remote_path: str) -> None:
    """Send one file path or one bytes payload to ``remote_path`` in a repo or bucket."""
    if not uri.is_bucket:
        # Repository destination: upload to the repo and revision named by the URI.
        api.upload_file(
            path_or_fileobj=source,
            path_in_repo=remote_path,
            repo_id=uri.id,
            repo_type=uri.type,
            revision=uri.revision,
        )
        return

    # Bucket destination: a batch "add" with a single (source, remote path) entry.
    api.batch_bucket_files(uri.id, add=[(source, remote_path)])