From 74b2e597e3cbc9d9e22752d8db3c3f367683ea7a Mon Sep 17 00:00:00 2001 From: Gaurav Singh <12825441+Grv-Singh@users.noreply.github.com> Date: Sun, 16 Jun 2024 19:54:03 +0530 Subject: [PATCH] Update index.html --- index.html | 496 ++++++++++++++++------------------------------------- 1 file changed, 149 insertions(+), 347 deletions(-) diff --git a/index.html b/index.html index df9cd24..38e3c99 100644 --- a/index.html +++ b/index.html @@ -1,360 +1,162 @@ - - -
- - - - - - - - - - - - - - - - - - - - -
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
- -
-
- + unwanted_text = "Note – For data quality reasons, please upload your photos directly from your computer, mobile phone or tablet. Do not upload them (or send from your mobile to your computer) via any other website, social network, or application of the sort of WhatsApp, Viber, Messenger or similar. Please do not modify your photos or their properties in any way. If you do so, your report will be excluded." -- -
- + with open(os.path.join(job_folder, 'Notes.txt'), 'w', encoding='utf-8') as file: + for element in text_elements: + text = element.get_text() # Get text with replaced-
# Continue with any other scraping on the main page
# Example: Extract and print all paragraph texts
paragraphs = soup.find_all('p')
for i, p in enumerate(paragraphs):
    print(f"Paragraph {i}: {p.text}")

# Close the browser
driver.quit()


def extract_report_data(driver):
    """Collect question/answer pairs from the report form on the current page.

    Walks every ``<tr>`` inside the element with class ``admin-question-form``
    and stops at the first row whose text contains "Bare - Code of Conduct".
    For each row that has at least one ``<td>``, the stripped text of the
    first cell is used as the question, and the row's form controls are read
    to build the answer.

    Args:
        driver: Selenium WebDriver positioned on the report page.

    Returns:
        dict mapping question text to the answer string. Multiple answers in
        one row are joined with ' and '. Rows that yield no answer are
        omitted; a later row with the same question text overwrites an
        earlier one.
    """
    report_data = {}
    table = driver.find_element(By.CLASS_NAME, "admin-question-form")
    for row in table.find_elements(By.CSS_SELECTOR, "tr"):
        if "Bare - Code of Conduct" in row.text:
            break  # Everything from this row onwards is not part of the report
        cells = row.find_elements(By.CSS_SELECTOR, "td")
        if not cells:
            continue
        question = cells[0].text.strip()
        inputs = row.find_elements(
            By.CSS_SELECTOR,
            "input[type='text'], input[type='radio'], "
            "input[type='checkbox'], textarea, select")
        answers = []
        for input_elem in inputs:
            # Each attribute read is a WebDriver round trip; fetch once.
            tag_name = input_elem.tag_name
            input_type = input_elem.get_attribute('type')

            if tag_name == 'textarea' or input_type == 'text':
                # get_attribute() may return None; treat that as empty.
                value = (input_elem.get_attribute('value') or '').strip()
                if value:
                    answers.append(value)
                    continue
                # An empty field counts as "Not Answered" unless its row is
                # styled with the grey background checked for below.
                row_style = input_elem.find_element(
                    By.XPATH, './ancestor::tr').get_attribute('style') or ''
                if 'background-color: rgb(128, 128, 128);' not in row_style:
                    answers.append('Not Answered')
            elif tag_name == 'select':
                # find_elements() avoids an exception when nothing is selected.
                chosen = input_elem.find_elements(
                    By.CSS_SELECTOR, 'option:checked')
                if chosen:
                    answers.append(chosen[0].text.strip())
            elif input_type in ('radio', 'checkbox') and input_elem.is_selected():
                # The label is read from the node immediately following the
                # input; guard against the input being the last child.
                label = driver.execute_script(
                    "var n = arguments[0].nextSibling;"
                    " return n ? n.textContent : '';", input_elem)
                answers.append((label or '').strip())

        # Join answers with 'and' if more than one value was collected
        answer = ' and '.join(filter(None, answers))
        if answer:  # Only add to dictionary if answer is not empty
            report_data[question] = answer
    return report_data


def data_processing(html_content, job_folder):
    soup = BeautifulSoup(html_content, 'html.parser')
    data_dict = {}

    # Find the specific
-
-
-
-