diff --git a/README.md b/README.md
index 027b06ea..b0e95584 100644
--- a/README.md
+++ b/README.md
@@ -557,6 +557,11 @@ Using this folder as a guide can be particularly helpful for:
   ```bash
   python main.py --resume /path/to/your/resume.pdf
   ```
+- **Using the collect mode:**
+  If you only want to collect job data (for example, to run your own data analytics), use the bot with the `--collect` option. All data found in LinkedIn job offers will be stored in the `output/data.json` file.
+  ```bash
+  python main.py --collect
+  ```
 
 ### Troubleshooting Common Issues
 
diff --git a/main.py b/main.py
index 1f86f037..82c66f0c 100644
--- a/main.py
+++ b/main.py
@@ -179,7 +179,12 @@ def create_and_run_bot(parameters, llm_api_key):
         bot.set_gpt_answerer_and_resume_generator(gpt_answerer_component, resume_generator_manager)
         bot.set_parameters(parameters)
         bot.start_login()
-        bot.start_apply()
+        if parameters['collectMode']:
+            print('Collecting')
+            bot.start_collect_data()
+        else:
+            print('Applying')
+            bot.start_apply()
     except WebDriverException as e:
         logger.error(f"WebDriver error occurred: {e}")
     except Exception as e:
@@ -188,7 +193,8 @@
 
 @click.command()
 @click.option('--resume', type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path), help="Path to the resume PDF file")
-def main(resume: Path = None):
+@click.option('--collect', is_flag=True, help="Only collect job data and store it in the data.json file")
+def main(collect: bool = False, resume: Path = None):
     try:
         data_folder = Path("data_folder")
         secrets_file, config_file, plain_text_resume_file, output_folder = FileManager.validate_data_folder(data_folder)
@@ -198,11 +204,13 @@
 
         parameters['uploads'] = FileManager.file_paths_to_dict(resume, plain_text_resume_file)
         parameters['outputFileDirectory'] = output_folder
+        parameters['collectMode'] = collect
 
         create_and_run_bot(parameters, llm_api_key)
     except ConfigError as ce:
         logger.error(f"Configuration error: {str(ce)}")
-        logger.error(f"Refer to the configuration guide for troubleshooting.")
+        logger.error("Refer to the configuration guide for troubleshooting: https://github.com/feder-cr/Auto_Jobs_Applier_AIHawk?tab=readme-ov-file#configuration")
+
     except FileNotFoundError as fnf:
         logger.error(f"File not found: {str(fnf)}")
         logger.error("Ensure all required files are present in the data folder.")
diff --git a/src/aihawk_bot_facade.py b/src/aihawk_bot_facade.py
index 1f5930b4..091a46d6 100644
--- a/src/aihawk_bot_facade.py
+++ b/src/aihawk_bot_facade.py
@@ -77,6 +77,12 @@ def start_apply(self):
         self.state.validate_state(['logged_in', 'job_application_profile_set', 'gpt_answerer_set', 'parameters_set'])
         self.apply_component.start_applying()
         logger.debug("Apply process started successfully")
+
+    def start_collect_data(self):
+        logger.debug("Starting collecting data process")
+        self.state.validate_state(['logged_in', 'job_application_profile_set', 'gpt_answerer_set', 'parameters_set'])
+        self.apply_component.start_collecting_data()
+        logger.debug("Collecting data process started successfully")
 
     def _validate_non_empty(self, value, name):
         logger.debug(f"Validating that {name} is not empty")
diff --git a/src/aihawk_job_manager.py b/src/aihawk_job_manager.py
index ef0d87ae..7d8b5f43 100644
--- a/src/aihawk_job_manager.py
+++ b/src/aihawk_job_manager.py
@@ -72,6 +72,51 @@ def set_resume_generator_manager(self, resume_generator_manager):
         logger.debug("Setting resume generator manager")
         self.resume_generator_manager = resume_generator_manager
 
+    def start_collecting_data(self):
+        searches = list(product(self.positions, self.locations))
+        random.shuffle(searches)
+        page_sleep = 0
+        minimum_time = 60 * 5
+        minimum_page_time = time.time() + minimum_time
+
+        for position, location in searches:
+            location_url = "&location=" + location
+            job_page_number = -1
+            utils.printyellow(f"Collecting data for {position} in {location}.")
+            try:
+                while True:
+                    page_sleep += 1
+                    job_page_number += 1
+                    utils.printyellow(f"Going to job page {job_page_number}")
+                    self.next_job_page(position, location_url, job_page_number)
+                    time.sleep(random.uniform(1.5, 3.5))
+                    utils.printyellow("Starting the collecting process for this page")
+                    self.read_jobs()
+                    utils.printyellow("Collecting data on this page has been completed!")
+
+                    time_left = minimum_page_time - time.time()
+                    if time_left > 0:
+                        utils.printyellow(f"Sleeping for {time_left} seconds.")
+                        time.sleep(time_left)
+                        minimum_page_time = time.time() + minimum_time
+                    if page_sleep % 5 == 0:
+                        sleep_time = random.randint(1, 5)
+                        utils.printyellow(f"Sleeping for {sleep_time} seconds.")
+                        time.sleep(sleep_time)
+                        page_sleep += 1
+            except Exception:
+                pass  # read_jobs raises once a page has no more results; move on to the next search
+            time_left = minimum_page_time - time.time()
+            if time_left > 0:
+                utils.printyellow(f"Sleeping for {time_left} seconds.")
+                time.sleep(time_left)
+                minimum_page_time = time.time() + minimum_time
+            if page_sleep % 5 == 0:
+                sleep_time = random.randint(50, 90)
+                utils.printyellow(f"Sleeping for {sleep_time / 60} minutes.")
+                time.sleep(sleep_time)
+                page_sleep += 1
+
     def start_applying(self):
         logger.debug("Starting job application process")
         self.easy_applier_component = AIHawkEasyApplier(self.driver, self.resume_path, self.set_old_answers,
@@ -214,6 +259,32 @@ def get_jobs_from_page(self):
             logger.error(f"Error while fetching job elements: {e}")
             return []
 
+    def read_jobs(self):
+        try:
+            no_jobs_element = self.driver.find_element(By.CLASS_NAME, 'jobs-search-two-pane__no-results-banner--expand')
+            if 'No matching jobs found' in no_jobs_element.text or 'unfortunately, things aren' in self.driver.page_source.lower():
+                raise Exception("No more jobs on this page")
+        except NoSuchElementException:
+            pass
+
+        job_results = self.driver.find_element(By.CLASS_NAME, "jobs-search-results-list")
+        utils.scroll_slow(self.driver, job_results)
+        utils.scroll_slow(self.driver, job_results, step=300, reverse=True)
+        job_list_elements = self.driver.find_elements(By.CLASS_NAME, 'scaffold-layout__list-container')[0].find_elements(By.CLASS_NAME, 'jobs-search-results__list-item')
+        if not job_list_elements:
+            raise Exception("No job class elements found on page")
+        job_list = [Job(*self.extract_job_information_from_tile(job_element)) for job_element in job_list_elements]
+        for job in job_list:
+            if self.is_blacklisted(job.title, job.company, job.link):
+                utils.printyellow(f"Blacklisted {job.title} at {job.company}, skipping...")
+                self.write_to_file(job, "skipped")
+                continue
+            try:
+                self.write_to_file(job, 'data')
+            except Exception:
+                self.write_to_file(job, "failed")
+                continue
+
     def apply_jobs(self):
         try:
             no_jobs_element = self.driver.find_element(By.CLASS_NAME, 'jobs-search-two-pane__no-results-banner--expand')
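
For a sense of how the `--collect` output can be used downstream, here is a minimal sketch of loading the `output/data.json` file this change produces. It assumes the file holds a JSON array of job records and that each record exposes a `company` field; the exact schema depends on `write_to_file` and `extract_job_information_from_tile`, so treat the field names below as assumptions rather than a documented format.

```python
# Minimal sketch: summarize the output/data.json file written by --collect.
# Assumes the file holds a JSON array of job records; the "company" field
# used below is an assumption, not a documented schema.
import json
from collections import Counter
from pathlib import Path

jobs = json.loads(Path("output/data.json").read_text(encoding="utf-8"))
print(f"Collected {len(jobs)} job offers")

# Example analysis: the ten companies that appear most often in the data
for company, count in Counter(job.get("company", "unknown") for job in jobs).most_common(10):
    print(f"{company}: {count}")
```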