Merge pull request #548 from MuilaerteJunior/main
Creating the '--collect' parameter to just collect jobs information
feder-cr authored Oct 16, 2024
2 parents a11381f + 5b55fad commit b4bb280
Showing 4 changed files with 93 additions and 3 deletions.
5 changes: 5 additions & 0 deletions README.md
@@ -557,6 +557,11 @@ Using this folder as a guide can be particularly helpful for:
```bash
python main.py --resume /path/to/your/resume.pdf
```
- **Using the collect mode:**
  If you only want to collect job data, for example to run your own analytics, you can start the bot with the `--collect` option. This stores every job offer found on LinkedIn in the `output/data.json` file (see the consumption sketch below the example).
```bash
python main.py --collect
```
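
  A minimal sketch of how the collected file might be consumed (illustration only, not part of this commit; it assumes `output/data.json` holds a JSON array of job records, and the `company` field name is an assumption rather than a documented schema):
  ```python
  import json
  from collections import Counter

  # Load the records produced by `python main.py --collect`.
  # Assumes output/data.json is a JSON array of job dictionaries.
  with open("output/data.json", encoding="utf-8") as f:
      jobs = json.load(f)

  # Example analysis: count postings per company.
  # The "company" key is an assumed field name, not a documented schema.
  by_company = Counter(job.get("company", "unknown") for job in jobs)
  print(by_company.most_common(10))
  ```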


### Troubleshooting Common Issues
14 changes: 11 additions & 3 deletions main.py
@@ -179,7 +179,12 @@ def create_and_run_bot(parameters, llm_api_key):
        bot.set_gpt_answerer_and_resume_generator(gpt_answerer_component, resume_generator_manager)
        bot.set_parameters(parameters)
        bot.start_login()
        if parameters['collectMode']:
            print('Collecting')
            bot.start_collect_data()
        else:
            print('Applying')
            bot.start_apply()
    except WebDriverException as e:
        logger.error(f"WebDriver error occurred: {e}")
    except Exception as e:
@@ -188,7 +193,8 @@ def create_and_run_bot(parameters, llm_api_key):

@click.command()
@click.option('--resume', type=click.Path(exists=True, file_okay=True, dir_okay=False, path_type=Path), help="Path to the resume PDF file")
def main(resume: Path = None):
@click.option('--collect', is_flag=True, help="Only collect job information into the data.json file")
def main(collect: bool = False, resume: Path = None):
    try:
        data_folder = Path("data_folder")
        secrets_file, config_file, plain_text_resume_file, output_folder = FileManager.validate_data_folder(data_folder)
@@ -198,11 +204,13 @@ def main(resume: Path = None):

        parameters['uploads'] = FileManager.file_paths_to_dict(resume, plain_text_resume_file)
        parameters['outputFileDirectory'] = output_folder
        parameters['collectMode'] = collect

        create_and_run_bot(parameters, llm_api_key)
    except ConfigError as ce:
        logger.error(f"Configuration error: {str(ce)}")
        logger.error(f"Refer to the configuration guide for troubleshooting.")
        logger.error(f"Refer to the configuration guide for troubleshooting: https://github.com/feder-cr/Auto_Jobs_Applier_AIHawk?tab=readme-ov-file#configuration {str(ce)}")

    except FileNotFoundError as fnf:
        logger.error(f"File not found: {str(fnf)}")
        logger.error("Ensure all required files are present in the data folder.")
6 changes: 6 additions & 0 deletions src/aihawk_bot_facade.py
@@ -77,6 +77,12 @@ def start_apply(self):
        self.state.validate_state(['logged_in', 'job_application_profile_set', 'gpt_answerer_set', 'parameters_set'])
        self.apply_component.start_applying()
        logger.debug("Apply process started successfully")

    def start_collect_data(self):
        logger.debug("Starting data collection process")
        self.state.validate_state(['logged_in', 'job_application_profile_set', 'gpt_answerer_set', 'parameters_set'])
        self.apply_component.start_collecting_data()
        logger.debug("Data collection process started successfully")

    def _validate_non_empty(self, value, name):
        logger.debug(f"Validating that {name} is not empty")
71 changes: 71 additions & 0 deletions src/aihawk_job_manager.py
@@ -72,6 +72,51 @@ def set_resume_generator_manager(self, resume_generator_manager):
logger.debug("Setting resume generator manager")
self.resume_generator_manager = resume_generator_manager

    def start_collecting_data(self):
        # Build every (position, location) pair and visit them in random order.
        searches = list(product(self.positions, self.locations))
        random.shuffle(searches)
        page_sleep = 0
        minimum_time = 60 * 5  # spend at least 5 minutes per search page before moving on
        minimum_page_time = time.time() + minimum_time

        for position, location in searches:
            location_url = "&location=" + location
            job_page_number = -1
            utils.printyellow(f"Collecting data for {position} in {location}.")
            try:
                while True:
                    page_sleep += 1
                    job_page_number += 1
                    utils.printyellow(f"Going to job page {job_page_number}")
                    self.next_job_page(position, location_url, job_page_number)
                    time.sleep(random.uniform(1.5, 3.5))
                    utils.printyellow("Starting the collecting process for this page")
                    self.read_jobs()
                    utils.printyellow("Collecting data on this page has been completed!")

                    # Enforce the per-page minimum dwell time before paging on.
                    time_left = minimum_page_time - time.time()
                    if time_left > 0:
                        utils.printyellow(f"Sleeping for {time_left} seconds.")
                        time.sleep(time_left)
                        minimum_page_time = time.time() + minimum_time
                    # Take a short extra break every fifth page.
                    if page_sleep % 5 == 0:
                        sleep_time = random.randint(1, 5)
                        utils.printyellow(f"Sleeping for {sleep_time / 60} minutes.")
                        time.sleep(sleep_time)
                        page_sleep += 1
            except Exception:
                # Any failure (e.g., no more result pages) ends this search; move to the next one.
                pass
            # Apply the same pacing between searches: honor the dwell time and the periodic longer break.
            time_left = minimum_page_time - time.time()
            if time_left > 0:
                utils.printyellow(f"Sleeping for {time_left} seconds.")
                time.sleep(time_left)
                minimum_page_time = time.time() + minimum_time
            if page_sleep % 5 == 0:
                sleep_time = random.randint(50, 90)
                utils.printyellow(f"Sleeping for {sleep_time / 60} minutes.")
                time.sleep(sleep_time)
                page_sleep += 1

    def start_applying(self):
        logger.debug("Starting job application process")
        self.easy_applier_component = AIHawkEasyApplier(self.driver, self.resume_path, self.set_old_answers,
@@ -214,6 +259,32 @@ def get_jobs_from_page(self):
logger.error(f"Error while fetching job elements: {e}")
return []

    def read_jobs(self):
        # Detect the "no results" banner; if present, there is nothing to collect on this page.
        try:
            no_jobs_element = self.driver.find_element(By.CLASS_NAME, 'jobs-search-two-pane__no-results-banner--expand')
            if 'No matching jobs found' in no_jobs_element.text or 'unfortunately, things aren' in self.driver.page_source.lower():
                raise Exception("No more jobs on this page")
        except NoSuchElementException:
            pass

        # Scroll the results list down and back up so LinkedIn lazy-loads every job tile.
        job_results = self.driver.find_element(By.CLASS_NAME, "jobs-search-results-list")
        utils.scroll_slow(self.driver, job_results)
        utils.scroll_slow(self.driver, job_results, step=300, reverse=True)
        job_list_elements = self.driver.find_elements(By.CLASS_NAME, 'scaffold-layout__list-container')[0].find_elements(By.CLASS_NAME, 'jobs-search-results__list-item')
        if not job_list_elements:
            raise Exception("No job class elements found on page")
        job_list = [Job(*self.extract_job_information_from_tile(job_element)) for job_element in job_list_elements]
        for job in job_list:
            if self.is_blacklisted(job.title, job.company, job.link):
                utils.printyellow(f"Blacklisted {job.title} at {job.company}, skipping...")
                self.write_to_file(job, "skipped")
                continue
            try:
                self.write_to_file(job, "data")
            except Exception:
                self.write_to_file(job, "failed")
                continue

    def apply_jobs(self):
        try:
            no_jobs_element = self.driver.find_element(By.CLASS_NAME, 'jobs-search-two-pane__no-results-banner--expand')
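For orientation (not part of the diff): `read_jobs` hands each record to `write_to_file(job, "data")`, which is expected to land in `output/data.json`. A minimal sketch of such a helper, under the assumption that `Job` is a dataclass and the file holds a single JSON array of records, could look like:

```python
import json
from dataclasses import asdict
from pathlib import Path

def write_to_file(job, file_name, output_dir="output"):
    """Append one job record to <output_dir>/<file_name>.json.

    Sketch only: assumes `job` is a dataclass instance and the target
    file holds a single JSON array of records.
    """
    path = Path(output_dir) / f"{file_name}.json"
    path.parent.mkdir(parents=True, exist_ok=True)
    records = json.loads(path.read_text(encoding="utf-8")) if path.exists() else []
    records.append(asdict(job))
    path.write_text(json.dumps(records, indent=2, ensure_ascii=False), encoding="utf-8")
```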
