Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

添加文件去重功能 #5

Open
wants to merge 4 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
57 changes: 57 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,8 @@ AliPCS-Py 是阿里云盘的非官方 api 和一个命令行运用程序。
- [文件重命名](#文件重命名)
- [拷贝文件](#拷贝文件)
- [删除文件](#删除文件)
- [搜索重复文件](#搜索重复文件)
- [清除重复文件](#清除重复文件)
- [下载文件或目录](#下载文件或目录)
- [播放媒体文件](#播放媒体文件)
- [上传文件](#上传文件)
Expand Down Expand Up @@ -566,6 +568,61 @@ AliPCS-Py remove --file-id ...
| ------------- | ------------ |
| -i, --file-id | TEXT 文件 ID |

## 搜索重复文件

搜索当前用户的全部文件,找出 content-hash 相同的重复文件。
可能需要运行很长时间,默认缓存搜索结果。

```
AliPCS-Py finddup [OPTIONS]

# 单线程工作
AliPCS-Py finddup --thread 1
# 少量搜索
AliPCS-Py finddup --number 200 --save-rate 200
# 大量搜索
AliPCS-Py finddup --number 10000 --save-rate 1000 --thread 20 --no-show-progress
# 输出保存的搜索结果
AliPCS-Py finddup --skip --output --output-path result.txt
# 删除搜索结果
AliPCS-Py finddup --drop
```

### 选项

| Option | Description |
| ------------------------------------------ | -------------------------------- |
| -n, --number | 本次搜索目录数量,默认为 1000 |
| -s, --save-rate | 每搜索多少目录后保存,默认为 500 |
| -d, --drop | 清除上次搜索结果 |
| -S/-nS, --show-progress/--no-show-progress | 显示搜索详细进度 |
| -t, --thread | 线程数,默认为 16 |
| --skip | 跳过本次搜索直接输出结果 |
| -o, --output | 输出查重结果 |
| --output-path | 查重结果输出文件路径 |

## 清除重复文件

根据 finddup 保存的搜索结果,每组相同文件中保留一个,删除其他。
注意!保留的文件是随机的。

```
AliPCS-Py finddup --number 10000 --save-rate 1000 --thread 20 --no-show-progress
# 先模拟运行一遍
AliPCS-Py cleandup -v --dry-run > test.txt
# 真实删除
AliPCS-Py cleandup
```

### 选项

| Option | Description |
| ---------------- | ---------------------------------- |
| --dry-run | 模拟运行不删除 |
| -v, --verbose | 显示细节 |
| -c, --chunk-size | 单次请求删除的文件个数,默认为 100 |
| -t, --thread | 线程数,默认为 16 |

## 下载文件或目录

使用文件路径:
Expand Down
50 changes: 36 additions & 14 deletions alipcs_py/alipcs/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -167,6 +167,7 @@ def list(
limit: int = 100,
url_expire_sec: int = 7200,
next_marker: str = "",
try_times: int = 0,
) -> Tuple[List[PcsFile], str]:
"""List the directory's contents

Expand All @@ -175,20 +176,41 @@ def list(
more, using the returned `next_marker` parameter for next `list` call.
"""

info = self._alipcs.list(
file_id=file_id,
share_id=share_id,
desc=desc,
name=name,
time=time,
size=size,
all=all,
limit=limit,
url_expire_sec=url_expire_sec,
next_marker=next_marker,
)
next_marker = info["next_marker"]
return [PcsFile.from_(v) for v in info.get("items", [])], next_marker
try:
info = self._alipcs.list(
file_id=file_id,
share_id=share_id,
desc=desc,
name=name,
time=time,
size=size,
all=all,
limit=limit,
url_expire_sec=url_expire_sec,
next_marker=next_marker,
)
next_marker = info["next_marker"]
return [PcsFile.from_(v) for v in info.get("items", [])], next_marker
except AliPCSError as e:
if e.error_code == "ParamFlowException" and try_times < 5:
# Frequent requests leads to this error, I guess
from time import sleep as time_sleep

time_sleep(0.2)
return self.list(
file_id,
share_id=share_id,
desc=desc,
name=name,
time=time,
size=size,
all=all,
limit=limit,
url_expire_sec=url_expire_sec,
next_marker=next_marker,
try_times=try_times + 1,
)
raise

def list_iter(
self,
Expand Down
76 changes: 76 additions & 0 deletions alipcs_py/app/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,9 @@
from alipcs_py.commands.cat import cat as _cat
from alipcs_py.commands import file_operators
from alipcs_py.commands.search import search as _search
from alipcs_py.commands.duplicate import drop as _dropdup
from alipcs_py.commands.duplicate import find_all_duplicates as _finddup
from alipcs_py.commands.duplicate import clean_duplicate as _cleandup
from alipcs_py.commands.download import (
download as _download,
Downloader,
Expand Down Expand Up @@ -664,6 +667,79 @@ def ls(
)


@app.command()
@click.option("--number", "-n", type=int, default=1000, help="本次搜索目录数量,默认为 1000")
@click.option("--save-rate", "-s", type=int, default=500, help="每搜索多少目录后保存,默认为 500")
@click.option("--drop", "-d", is_flag=True, help="清除上次搜索结果")
@click.option(
    "--show-progress/--no-show-progress",
    "-S/-nS",
    is_flag=True,
    default=True,
    help="显示搜索详细进度",
)
@click.option("--thread", "-t", type=int, default=16, help="线程数,默认为 16")
@click.option("--skip", is_flag=True, default=False, help="跳过本次搜索直接输出结果")
@click.option("--output", "-o", is_flag=True, default=False, help="输出查重结果")
@click.option("--output-path", type=str, default="", help="查重结果输出文件路径")
@click.pass_context
@handle_error
@multi_user_do
def finddup(
    ctx, number, save_rate, drop, show_progress, thread, skip, output, output_path
):
    """搜索所有文件找出重复文件

    \b
    耗时可能极长,默认继续上一次的扫描。若结果较多请指定文件输出。

    \b
    examples:
    AliPCS-Py finddup -n 400 -s 200 -nS -t 8 -o
    """
    # NOTE: the docstring above is rendered by click as the command's help
    # text, so it is kept verbatim; maintainer notes live in comments.
    #
    # Command flow:
    #   1. Resolve the API object for the current user; do nothing without it.
    #   2. `--drop` discards the saved search result and exits. It is handled
    #      before the number/save-rate validation because dropping does not
    #      depend on those values.
    #   3. Validate option combinations, printing an error and exiting on
    #      misuse.
    #   4. Delegate the actual search/output to `_finddup`.
    api = _recent_api(ctx)
    if not api:
        return

    if drop:
        _dropdup()
        return

    if save_rate > number:
        print("Error! save_rate should be less than number.")
        return

    # `--output-path` is only meaningful together with `--output`. The
    # previous check was inverted: it complained when both were given (the
    # documented usage) and stayed silent when `--output` was missing.
    if output_path and not output:
        print("Error! Specify --output first if you want to set output path.")
        return

    _finddup(api, number, save_rate, show_progress, thread, skip, output, output_path)


@app.command()
@click.option("--dry-run", is_flag=True, default=False, help="模拟运行不删除")
@click.option("--verbose", "-v", is_flag=True, default=False, help="显示细节")
@click.option(
    "--chunk-size",
    "-c",
    type=int,
    default=100,
    help="单次请求删除的文件个数,默认为 100",
)
@click.option("--thread", "-t", type=int, default=16, help="线程数,默认为 16")
@click.pass_context
@handle_error
@multi_user_do
def cleandup(ctx, dry_run, verbose, chunk_size, thread):
    """清除重复文件

    \b
    根据 finddup 保存的搜索结果,每组相同文件中保留一个,删除其他。
    注意!保留的文件是随机的。

    \b
    examples:
    AliPCS-Py finddup -n 5000 -s 1000 -nS
    AliPCS-Py cleandup -v --dry-run > result.txt
    AliPCS-Py cleandup
    """
    # NOTE: the docstring above is rendered by click as the command's help
    # text, so it is kept verbatim; maintainer notes live in comments.
    #
    # Thin CLI wrapper: resolve the current user's API object and hand the
    # parsed options to `_cleandup`. The CLI option `--thread` is forwarded
    # under the keyword name `threads`.
    api = _recent_api(ctx)
    if api:
        cleanup_options = {
            "dry_run": dry_run,
            "verbose": verbose,
            "chunk_size": chunk_size,
            "threads": thread,
        }
        _cleandup(api=api, **cleanup_options)


@app.command()
@click.argument("keyword", nargs=1, type=str)
@click.option("--include", "-I", type=str, help="筛选包含这个字符串的文件")
Expand Down
Loading