import os
import sys
from pathlib import Path

import requests
from slugify import slugify
from dotenv import load_dotenv

load_dotenv()


class BoostFetcher:
    """Exports every Boostnote document reachable from the account's folders
    to markdown files under a local base directory, mirroring the
    workspace/folder hierarchy."""

    BASE_URL = 'https://boostnote.io/api/'
    DOC_URI = 'docs'
    FOLDERS_URI = 'folders'
    # The docs and folders endpoints support perPage/page parameters, so in
    # case the root has that many entries we paginate the API calls.
    DEBUG_MODE = False

    def __init__(self, token, base_folder):
        """
        :param token: Boostnote API token, sent as a Bearer header.
        :param base_folder: local directory the export tree is written under.
        """
        self.token = token
        self._processed_docs = set()
        self._processed_documents_count = 0
        self._processed_folders = set()
        self.folder_cache = {}
        self.base_folder = Path(base_folder).absolute()
        # One session so the auth header and connection pool are shared
        # across all API calls.
        self.session = requests.Session()
        self.session.headers['Authorization'] = f"Bearer {self.token}"

    def _fetch_folder_data(self, folder_id) -> dict:
        """Fetch a single folder's metadata from the server."""
        response = self.session.get(
            f"{self.BASE_URL}{self.FOLDERS_URI}/{folder_id}"
        )
        # Fail loudly on auth/network errors instead of a KeyError below.
        response.raise_for_status()
        return response.json()['folder']

    def _fetch_doc_data(self, doc_id) -> dict:
        """Fetch a single document's data from the server."""
        # FIX: dropped a stray trailing '?' from the URL.
        response = self.session.get(
            f"{self.BASE_URL}{self.DOC_URI}/{doc_id}"
        )
        response.raise_for_status()
        return response.json()['doc']

    def _process_doc(self, doc_id: str, folder_path: Path):
        """Download one document and save it as markdown inside folder_path.

        Existing files are never overwritten, so re-runs only fetch what is
        missing. The doc is counted as processed either way.
        """
        doc = self._fetch_doc_data(doc_id)
        title = doc['title']
        # filename = folder_path / f"{slugify(title)}-{doc_id}.md"
        filename = folder_path / f"{slugify(title)}.md"
        # FIX: previously printed a literal placeholder instead of the path.
        print(f"Saving: {filename}")
        if not filename.exists():
            with filename.open('w', encoding='utf-8') as f:
                # 'head' or its 'content' can be null for empty documents;
                # in that case an empty file is still created so the doc is
                # skipped on the next run.
                doc_head = doc['head']
                if doc_head is not None and doc_head['content'] is not None:
                    f.write(doc_head['content'])
        self._processed_docs.add(doc_id)
        self._processed_documents_count += 1

    def _get_folder(self, folder_id):
        """Return a folder's data given its id, consulting the cache first."""
        if folder_id not in self.folder_cache:
            self.folder_cache[folder_id] = self._fetch_folder_data(folder_id)
        return self.folder_cache[folder_id]

    def _process_folder(self, folder_data: dict):
        """Create the folder's local directory, save its documents, then
        recurse into its subfolders."""
        pathname = folder_data['pathname']
        workspace_id = folder_data['workspaceId']
        child_docs_ids = folder_data['childDocsIds']
        if workspace_id is None:
            workspace_id = 'UnknownWorkspace'
        if self.DEBUG_MODE is True:
            print(f"Processing folder {self.base_folder}/{workspace_id}/{pathname}")
        # pathname comes back absolute ('/a/b'); strip the single leading
        # slash so it joins under base_folder instead of replacing it.
        pathname = pathname[1:] if pathname.startswith('/') else pathname
        path = self.base_folder / workspace_id / pathname
        path.mkdir(parents=True, exist_ok=True)
        for doc_id in child_docs_ids:
            if doc_id not in self._processed_docs:
                self._process_doc(doc_id, path)
        self._processed_folders.add(folder_data['id'])
        # Processing subfolders
        for folder_id in folder_data['childFoldersIds']:
            if folder_id not in self._processed_folders:
                self._process_folder(self._get_folder(folder_id))

    def _fetch_folders_page(self, page: int) -> list:
        """Fetch one page of the folder listing; empty list means past the end."""
        response = self.session.get(
            f"{self.BASE_URL}{self.FOLDERS_URI}?page={page}"
        )
        response.raise_for_status()
        return response.json()['folders']

    def fetch_folders(self) -> None:
        """Walk every page of the folder listing and process each folder.

        FIX: the return annotation previously claimed requests.Response but
        the method returns None; the duplicated pre-loop/in-loop request
        code is now a single helper call.
        """
        current_page = 1
        while True:
            fetched_folders = self._fetch_folders_page(current_page)
            if not fetched_folders:
                break
            print(f"[INFO]: processing page {current_page}")
            for folder in fetched_folders:
                if folder['id'] not in self._processed_folders:
                    self._process_folder(folder)
            current_page += 1


if __name__ == '__main__':
    token = os.environ.get('BOOST_TOKEN')
    # FIX: was an `assert`, which is stripped under `python -O`.
    if not token:
        raise SystemExit('A token is required, please set it in your environment')
    # Destination: CLI argument wins, then BASE_DIR env var, then ./backup.
    if len(sys.argv) > 1:
        base_folder = sys.argv[1]
    else:
        base_folder = os.environ.get('BASE_DIR', './backup')
    fetcher = BoostFetcher(token, base_folder)
    fetcher.fetch_folders()
    print(f"Exporter Finished: Fetched {fetcher._processed_documents_count} documents.")