feat: add filter (#61)

tangyoha · Feb 13, 2023 · 3c4d246 · 3c4d246
1 parent 076537c
commit 3c4d246
Show file tree

Hide file tree

Showing 17 changed files with 961 additions and 131 deletions.
diff --git a/.gitignore b/.gitignore
@@ -59,3 +59,5 @@ photo/
 voice/
 video/
 video_note/
+parser.out
+parsetab.py
diff --git a/README.md b/README.md
@@ -6,8 +6,8 @@
 <a href="https://codecov.io/gh/tangyoha/telegram_media_downloader"><img alt="Coverage Status" src="https://codecov.io/gh/tangyoha/telegram_media_downloader/branch/master/graph/badge.svg"></a>
 <a href="https://github.com/tangyoha/telegram_media_downloader/blob/master/LICENSE"><img alt="License: MIT" src="https://black.readthedocs.io/en/stable/_static/license.svg"></a>
 <a href="https://github.com/python/black"><img alt="Code style: black" src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
-<img alt="Code style: black" src="https://img.shields.io/github/downloads/tangyoha/telegram_media_downloader/total">
-<img alt="Code style: black" src="https://img.shields.io/github/v/release/tangyoha/telegram_media_downloader?display_name=tag&include_prereleases">
+<a href="https://github.com/tangyoha/telegram_media_downloader/releases">
+<img alt="Latest release" src="https://img.shields.io/github/v/release/tangyoha/telegram_media_downloader?display_name=tag&include_prereleases"></a>
 </p>
 
 <h3 align="center">
@@ -149,6 +149,8 @@ file_name_prefix_split: ' - '
 max_concurrent_transmissions: 1
 web_host: 127.0.0.1
 web_port: 5000
+download_filter:
+  'telegram_chat_id': message_date >= 2022-12-01 00:00:00 and message_date <= 2023-01-17 00:00:00
 ```
 
 - **api_hash**  - The api_hash you got from telegram apps
@@ -160,26 +162,27 @@ web_port: 5000
 - **file_formats** - File types to download for supported media types which are `audio`, `document` and `video`. Default format is `all`, downloads all files.
 - **save_path** - The root directory where you want to store downloaded files.
 - **file_path_prefix** - Store file subfolders, the order of the list is not fixed, can be randomly combined.
-  - `chat_title`      - channel or group title, it will be chat id if not exist title.
-  - `media_datetime`  - media date, also see pyrogram.types.Message.date.strftime("%Y_%m").
-  - `media_type`      - media type, also see `media_types`.
+  - `chat_title`      - Channel or group title; the chat id is used when the chat has no title.
+  - `media_datetime`  - Media date, also see pyrogram.types.Message.date.strftime("%Y_%m").
+  - `media_type`      - Media type, also see `media_types`.
 - **disable_syslog** - You can choose which types of logs to disable, see `logging._nameToLevel`.
 - **upload_drive** - You can upload file to cloud drive.
   - `enable_upload_file` - Enable upload file, default `false`.
   - `remote_dir` - Where you upload, like `drive_id/drive_name`.
   - `upload_adapter` - Upload file adapter, which can be `rclone`, `aligo`. If it is `rclone`, it supports all `rclone` servers that support uploading. If it is `aligo`, it supports uploading `Ali cloud disk`.
-  - `rclone_path` - RClone exe path, see wiki[how to use rclone](https://github.com/tangyoha/telegram_media_downloader/wiki#how-to-use-rclone)
+  - `rclone_path` - RClone exe path, see [How to use rclone](https://github.com/tangyoha/telegram_media_downloader/wiki/Rclone)
   - `before_upload_file_zip` - Zip file before upload, default `false`.
   - `after_upload_file_delete` - Delete file after upload success, default `false`.
-- **file_name_prefix** - custom file name, use the same as **file_path_prefix**
-  - `message_id` - message id
-  - `file_name` - file name (may be empty)
-  - `caption` - the title of the message (may be empty)
-- **file_name_prefix_split** - custom file name prefix symbol, the default is `-`
+- **file_name_prefix** - Custom file name; configured the same way as **file_path_prefix**.
+  - `message_id` - Message id
+  - `file_name` - File name (may be empty)
+  - `caption` - The title of the message (may be empty)
+- **file_name_prefix_split** - Custom file name prefix symbol, the default is `-`
 - **max_concurrent_transmissions** - Set the maximum amount of concurrent transmissions (uploads & downloads). A value that is too high may result in network related issues. Defaults to 1.
-- **hide_file_name** - whether to hide the web interface file name, default `false`
-- **web_host** - web host
-- **web_port** - web port
+- **hide_file_name** - Whether to hide the web interface file name, default `false`
+- **web_host** - Web host.
+- **web_port** - Web port.
+- **download_filter** - Download filter, see [How to use Filter](https://github.com/tangyoha/telegram_media_downloader/wiki/How-to-use-Filter)
 
 ## Execution
 

diff --git a/README_CN.md b/README_CN.md
@@ -6,9 +6,9 @@
 <a href="https://codecov.io/gh/tangyoha/telegram_media_downloader"><img alt="Coverage Status" src="https://codecov.io/gh/tangyoha/telegram_media_downloader/branch/master/graph/badge.svg"></a>
 <a href="https://github.com/tangyoha/telegram_media_downloader/blob/master/LICENSE"><img alt="License: MIT" src="https://black.readthedocs.io/en/stable/_static/license.svg"></a>
 <a href="https://github.com/python/black"><img alt="Code style: black" src="https://img.shields.io/badge/code%20style-black-000000.svg"></a>
-<img alt="Code style: black" src="https://img.shields.io/github/downloads/tangyoha/telegram_media_downloader/total">
+<a href="https://github.com/python/black">
 <img alt="Code style: black" src="https://img.shields.io/github/v/release/tangyoha/telegram_media_downloader?display_name=tag&include_prereleases">
-</p>
+</a>
 </p>
 
 <h3 align="center">
@@ -141,6 +141,8 @@ file_name_prefix_split: ' - '
 max_concurrent_transmissions: 1
 web_host: 127.0.0.1
 web_port: 5000
+download_filter:
+  'telegram_chat_id': message_date >= 2022-12-01 00:00:00 and message_date <= 2023-01-17 00:00:00
 ```
 
 - **api_hash** - 你从电报应用程序获得的 api_hash
@@ -160,7 +162,7 @@ web_port: 5000
   - `enable_upload_file` - [必填]启用上传文件，默认为`false`
   - `remote_dir` - [必填]你上传的地方
   - `upload_adapter` - [必填]上传文件适配器，可以为`rclone`,`aligo`。如果为`rclone`，则支持rclone所有支持上传的服务器，如果为aligo，则支持上传阿里云盘
-  - `rclone_path`，如果配置`upload_adapter`为`rclone`则为必填，`rclone`的可执行目录，见wiki[如何使用rclone](https://github.com/tangyoha/telegram_media_downloader/wiki#how-to-use-rclone)
+  - `rclone_path`，如果配置`upload_adapter`为`rclone`则为必填，`rclone`的可执行目录，查阅 [如何使用rclone](https://github.com/tangyoha/telegram_media_downloader/wiki/Rclone)
   - `before_upload_file_zip` - 上传前压缩文件，默认为`false`
   - `after_upload_file_delete` - 上传成功后删除文件，默认为`false`
 - **file_name_prefix** - 自定义文件名称,使用和 **file_path_prefix** 一样
@@ -172,6 +174,7 @@ web_port: 5000
 - **hide_file_name** - 是否隐藏web界面文件名称，默认`false`
 - **web_host** - web界面地址
 - **web_port** - web界面端口
+- **download_filter** - 下载过滤器, 查阅 [How to use Filter](https://github.com/tangyoha/telegram_media_downloader/wiki/How-to-use-Filter)
 
 ## 执行
 

diff --git a/media_downloader.py b/media_downloader.py
@@ -16,6 +16,7 @@
 from module.web import get_flask_app, update_download_status
 from utils.log import LogFilter
 from utils.meta import print_meta
+from utils.meta_data import MetaData
 from utils.updates import check_for_updates
 
 logging.basicConfig(
@@ -269,7 +270,6 @@ async def download_media(
                 continue
             file_name, file_format = await _get_media_meta(message, _media, _type)
             media_size = getattr(_media, "file_size", 0)
-
             if _can_download(_type, file_formats, file_format):
                 if _is_exist(file_name):
                     # TODO: check if the file download complete
@@ -325,7 +325,6 @@ async def download_media(
                 _check_download_finish(media_size, download_path, message.id)
                 await app.upload_file(file_name)
 
-                app.downloaded_ids.append(message.id)
             break
         except pyrogram.errors.exceptions.bad_request_400.BadRequest:
             logger.warning(
@@ -434,8 +433,11 @@ async def begin_import(pagination_limit: int):
         api_id=app.api_id,
         api_hash=app.api_hash,
         proxy=app.proxy,
-        max_concurrent_transmissions=app.max_concurrent_transmissions,
     )
+
+    if getattr(client, "max_concurrent_transmissions", None):
+        client.max_concurrent_transmissions = app.max_concurrent_transmissions
+
     await client.start()
     print("Successfully started (Press Ctrl+C to stop)")
 
@@ -467,11 +469,12 @@ async def begin_import(pagination_limit: int):
                 app.last_read_message_id = last_read_message_id
 
     async for message in messages_iter:  # type: ignore
-        if pagination_count != pagination_limit and not app.need_skip_message(
-            message.id
-        ):
-            pagination_count += 1
-            messages_list.append(message)
+        meta_data = MetaData()
+        meta_data.get_meta_data(message)
+        if pagination_count != pagination_limit:
+            if not app.need_skip_message(str(app.chat_id), message.id, meta_data):
+                pagination_count += 1
+                messages_list.append(message)
         else:
             last_read_message_id = await process_messages(
                 client,

diff --git a/module/app.py b/module/app.py
@@ -6,6 +6,9 @@
 from loguru import logger
 
 from module.cloud_drive import CloudDrive, CloudDriveConfig
+from module.filter import Filter
+from utils.format import replace_date_time
+from utils.meta_data import MetaData
 
 # pylint: disable = R0902
 
@@ -37,6 +40,7 @@ def __init__(
         self.config_file: str = config_file
         self.app_data_file: str = app_data_file
         self.application_name: str = application_name
+        self.download_filter = Filter()
 
         self.reset()
 
@@ -84,6 +88,7 @@ def reset(self):
         self.max_concurrent_transmissions: int = 1
         self.web_host: str = "localhost"
         self.web_port: int = 5000
+        self.download_filter_dict: dict = {}
 
     def load_config(self, _config: dict) -> bool:
         """load config from str.
@@ -165,6 +170,15 @@ def load_config(self, _config: dict) -> bool:
         self.web_host = _config.get("web_host", self.web_host)
         self.web_port = _config.get("web_port", self.web_port)
 
+        self.download_filter_dict = _config.get(
+            "download_filter", self.download_filter_dict
+        )
+
+        for key, value in self.download_filter_dict.items():
+            self.download_filter_dict[key] = replace_date_time(value)
+
+        # TODO: add check if expression exist syntax error
+
         self.max_concurrent_transmissions = _config.get(
             "max_concurrent_transmissions", self.max_concurrent_transmissions
         )
@@ -273,19 +287,34 @@ def get_file_name(
             res = f"{message_id}"
         return res
 
-    def need_skip_message(self, message_id: int) -> bool:
+    def need_skip_message(
+        self, chat_id: str, message_id: int, meta_data: MetaData
+    ) -> bool:
         """if need skip download message.
 
         Parameters
         ----------
+        chat_id: str
+            Config.yaml defined
+
         message_id: int
-            readily to download message id
+            Readily to download message id
+
+        meta_data: MetaData
+            Ready to match filter
 
         Returns
         -------
         bool
         """
-        return self.ids_to_retry_dict.get(message_id) is not None
+        if message_id in self.ids_to_retry_dict:
+            return True
+
+        if chat_id in self.download_filter_dict:
+            self.download_filter.set_meta_data(meta_data)
+            return not self.download_filter.exec(self.download_filter_dict[chat_id])
+
+        return False
 
     def update_config(self, immediate: bool = True):
         """update config

diff --git a/module/cloud_drive.py b/module/cloud_drive.py
@@ -72,7 +72,7 @@ def zip_file(local_file_path: str) -> str:
         Zip local file
         """
 
-        zip_file_name = os.path.basename(local_file_path).split(".")[0] + ".zip"
+        zip_file_name = local_file_path.split(".")[0] + ".zip"
         with ZipFile(zip_file_name, "w") as zip_writer:
             zip_writer.write(local_file_path)
 
@@ -104,8 +104,8 @@ async def rclone_upload_file(
                 file_path = local_file_path
 
             cmd = (
-                f'"{drive_config.rclone_path}" copy "{file_path}"'
-                "{remote_dir}/ --create-empty-src-dirs --ignore-existing --progress"
+                f'"{drive_config.rclone_path}" copy "{file_path}" '
+                f"{remote_dir}/ --create-empty-src-dirs --ignore-existing --progress"
             )
             proc = await asyncio.create_subprocess_shell(
                 cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.STDOUT