-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathtest_googlesearch.py
95 lines (76 loc) · 3.02 KB
/
test_googlesearch.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
from bs4 import BeautifulSoup
import httpx
from urllib.parse import quote
import os
import requests
import re
from pdfminer.high_level import extract_text
# Google Custom Search configuration.
# The API key is read from the GOOGLE_API_KEY environment variable and is None
# when that variable is unset. For local testing it can be set directly, e.g.:
# os.environ['GOOGLE_API_KEY'] = 'your_google_api_key_here' # Replace with your actual Google API key
my_api_key = os.getenv('GOOGLE_API_KEY')
# Custom search engine ID; the same value is sent as the "cx" request
# parameter by perform_google_search_legislation below.
my_cse_id= '754370c1b456c454f'
def clean_html(raw_html):
    """Return the text content of ``raw_html`` with all markup removed.

    The input is parsed with BeautifulSoup's built-in ``html.parser`` and the
    text of the whole document is returned as one string.
    """
    # Further clean-up (e.g. dropping reference sections) could be added here.
    return BeautifulSoup(raw_html, "html.parser").get_text()
from sklearn.feature_extraction.text import TfidfVectorizer
def convert_spaces_to_percent20(text):
    """Percent-encode ``text`` so it can be embedded in a URL.

    Spaces become ``%20``. Because no character is treated as safe, reserved
    characters such as ``/`` and ``&`` are escaped as well.
    """
    encoded = quote(text, safe="")
    return encoded
def perform_google_search_legislation(query: str, page: int = 1):
    """Run a Google Custom Search query and return one page of results.

    The API key is read from the ``GOOGLE_API_KEY`` environment variable and
    the search engine ID is fixed. Each page holds three results.

    Args:
        query (str): The search query. It is passed as a request parameter,
            so characters such as ``&``, ``#`` or ``+`` are URL-encoded
            instead of corrupting the request URL.
        page (int): The 1-based page number of the search results to retrieve.

    Returns:
        dict | None: ``{"results": [...]}`` where every entry has the keys
        ``result_number``, ``title``, ``description``, ``long_description``
        and ``url``. ``None`` is returned (after printing a message) when the
        response status is not 200 or the response contains no items.

    Raises:
        requests.RequestException: If the request fails at the network level
            or does not complete within the timeout.
    """
    api_key = os.getenv('GOOGLE_API_KEY')
    search_engine_id = '754370c1b456c454f'
    results_per_page = 3

    # Index of the first result on the requested page (the API is 1-based).
    start = (page - 1) * results_per_page + 1

    # Let requests build the query string so every value is encoded properly.
    params = {
        "key": api_key,
        "cx": search_engine_id,
        "q": query,
        "start": start,
        "num": results_per_page,
    }
    # A timeout keeps an unresponsive server from blocking the caller forever.
    response = requests.get(
        "https://www.googleapis.com/customsearch/v1",
        params=params,
        timeout=10,
    )
    # Echo the raw response object, as the function has always done.
    print(response)
    if response.status_code != 200:
        print(f"Failed to fetch data: HTTP {response.status_code}")
        return None

    search_items = response.json().get("items")
    if not search_items:
        print("No results found.")
        return None

    results = []
    # Number the results continuously across pages.
    for result_number, search_item in enumerate(search_items, start=start):
        # "metatags" may be missing or an empty list; fall back to an empty
        # mapping so the lookup below cannot raise IndexError.
        metatags = search_item.get("pagemap", {}).get("metatags") or [{}]
        results.append({
            "result_number": result_number,
            "title": search_item.get("title"),
            "description": search_item.get("snippet"),
            "long_description": metatags[0].get("og:description", "N/A"),
            "url": search_item.get("link"),
        })
    return {"results": results}
# Manual smoke test for perform_google_search_legislation
def test_perform_google_search_legislation():
    """Search for a sample question and print each result of the first page.

    Nothing is printed by this function when the search returns a falsy value.
    """
    response = perform_google_search_legislation(
        "What is the legislation on AI in the UK?", 1
    )
    if not response:
        return

    for entry in response["results"]:
        print(f"Result #{entry['result_number']}")
        print(f"Title: {entry['title']}")
        print(f"Description: {entry['description']}")
        print(f"Long Description: {entry['long_description']}")
        print(f"URL: {entry['url']}")
        # Blank line between consecutive results.
        print()
# Call the test function.
# NOTE(review): this call sits at module level, so the search request is
# issued whenever the file is executed or imported — confirm that is intended
# (an `if __name__ == "__main__":` guard would limit it to direct execution).
test_perform_google_search_legislation()