diff --git a/Doxyfile b/Doxyfile index 0cdd4af..757b53f 100644 --- a/Doxyfile +++ b/Doxyfile @@ -943,7 +943,7 @@ WARN_LOGFILE = # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. -INPUT = ./lib/src ./lib/include ./docs/Architecture_documentation +INPUT = ./lib/src ./lib/include ./docs/Architecture_documentation ./search_engine # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses diff --git a/docs/html/____init_____8py.html b/docs/html/____init_____8py.html new file mode 100644 index 0000000..e31b163 --- /dev/null +++ b/docs/html/____init_____8py.html @@ -0,0 +1,117 @@ + + +
+ + + + +
+ Search Engine
+
+ |
+
+Namespaces | |
namespace | search_engine |
namespace | search_engine.search_engine |
+ Search Engine
+
+ |
+
#include "Python.h"
+Macros | |
#define | PY_SSIZE_T_CLEAN |
#define PY_SSIZE_T_CLEAN | +
+ Search Engine
+
+ |
+
#include "Python.h"
+Macros | |
#define | PY_SSIZE_T_CLEAN |
#define PY_SSIZE_T_CLEAN | +
▼Ninverted_index | |
Cdocs | Structure that stores information about a document |
▼Npage_rank | |
CGraph | Represents a directed graph used to compute the PageRank algorithm |
CHello | |
▼Nsearch_engine | |
▼Nsearch_engine | |
▼Ncrawler | |
CCrawler | |
▼Nexceptions | |
CUrlError | |
▼Nhelper | |
▼Nconverter | |
CStringToIntConverter | |
CHello |
+ Search Engine
+
+ |
+
This is the complete list of members for search_engine.search_engine.crawler.Crawler, including all inherited members.
+__init__(self, str url_base, str page_name, str initial_page, list[str] remove_pages=[], bool test_mode=False) | search_engine.search_engine.crawler.Crawler | |
_get_links(self, str current_page) | search_engine.search_engine.crawler.Crawler | protected |
_get_paragraphs(self, str current_page) | search_engine.search_engine.crawler.Crawler | protected |
_validate_url(self, str url) | search_engine.search_engine.crawler.Crawler | protected |
converter | search_engine.search_engine.crawler.Crawler | |
graph | search_engine.search_engine.crawler.Crawler | |
initial_page | search_engine.search_engine.crawler.Crawler | |
page_name | search_engine.search_engine.crawler.Crawler | |
REGEX | search_engine.search_engine.crawler.Crawler | static |
remove_pages | search_engine.search_engine.crawler.Crawler | |
run(self, limit=2) | search_engine.search_engine.crawler.Crawler | |
test_mode | search_engine.search_engine.crawler.Crawler | |
url_base | search_engine.search_engine.crawler.Crawler |
+ Search Engine
+
+ |
+
+Public Member Functions | |
__init__ (self, str url_base, str page_name, str initial_page, list[str] remove_pages=[], bool test_mode=False) | |
run (self, limit=2) | |
+Public Attributes | |
url_base | |
page_name | |
initial_page | |
remove_pages | |
test_mode | |
graph | |
converter | |
+Static Public Attributes | |
REGEX | |
+Protected Member Functions | |
list | _get_links (self, str current_page) |
list | _get_paragraphs (self, str current_page) |
_validate_url (self, str url) | |
A web crawler that traverses a network of web pages, extracts links and paragraphs, +and builds a link graph for use by the PageRank algorithm. + +This crawler starts from a given base URL and an initial page, and follows links within +the website to collect data such as links and paragraphs. It constructs a graph based on +the relationships between pages. + +@param url_base: The base URL from which the crawler starts. +@param page_name: The path prefix appended to url_base before each page name; only links whose href starts with it are followed. +@param initial_page: The first page to crawl. +@param remove_pages: A list of pages to exclude from the crawl. (Optional, default is []) +@param test_mode: Flag to activate the test mode for crawling. (Optional, default is False) + +Attributes +---------- +graph : PyGraph + A graph representation used for PageRank calculations. + +Examples +-------- +>>> crawler = Crawler('https://example.com', '/wiki/', 'Home') +>>> graph = crawler.run() +
search_engine.search_engine.crawler.Crawler.__init__ | +( | ++ | self, | +
+ | + | str | +url_base, | +
+ | + | str | +page_name, | +
+ | + | str | +initial_page, | +
+ | + | list[str] | +remove_pages = [] , |
+
+ | + | bool | +test_mode = False |
+
+ | ) | ++ |
Initialize the Crawler with a base URL, a page path prefix, the starting page, and optionally, +a list of pages to exclude. + +@param url_base: The base URL of the website to crawl. +@param page_name: The path prefix appended to url_base before each page name; only links whose href starts with it are followed. +@param initial_page: The starting page to begin crawling. +@param remove_pages: Pages to exclude from the crawl (optional). +@param test_mode: Flag to activate the test mode for crawling. (default: False) ++
+
|
+ +protected | +
Retrieve all links from the specified page. + +@param current_page: The current page to fetch links from. + +@return: A list of valid links found on the current page. + +@raises UrlError: If the current page URL is invalid or inaccessible. ++
+
|
+ +protected | +
Retrieve all paragraphs from the specified page. + +@param current_page: The current page to fetch paragraphs from. + +@return: A list of paragraphs' text found on the current page. ++
+
|
+ +protected | +
Validate the format of the given URL. + +@param url: The URL to validate. + +@raises UrlError: If the URL is not valid. ++
search_engine.search_engine.crawler.Crawler.run | +( | ++ | self, | +
+ | + | + | limit = 2 |
+
+ | ) | ++ |
Run the crawler to collect links and paragraphs starting from the initial page. +It uses a queue to traverse the links in a breadth-first manner. + +@param limit: The maximum number of pages to crawl in test mode. (default: 2) + +@return: A graph representation of the pages and their links. ++
search_engine.search_engine.crawler.Crawler.converter | +
search_engine.search_engine.crawler.Crawler.graph | +
search_engine.search_engine.crawler.Crawler.initial_page | +
search_engine.search_engine.crawler.Crawler.page_name | +
+
|
+ +static | +
search_engine.search_engine.crawler.Crawler.remove_pages | +
search_engine.search_engine.crawler.Crawler.test_mode | +
search_engine.search_engine.crawler.Crawler.url_base | +
+ Search Engine
+
+ |
+
This is the complete list of members for search_engine.search_engine.exceptions.UrlError, including all inherited members.
+__init__(self, message="Invalid URL") | search_engine.search_engine.exceptions.UrlError | |
message | search_engine.search_engine.exceptions.UrlError |
+ Search Engine
+
+ |
+
+Public Member Functions | |
__init__ (self, message="Invalid URL") | |
+Public Attributes | |
message | |
Exception raised when an invalid URL is provided. + +Inherits from the built-in `ValueError` class. + +Attributes +---------- +message : str + The error message that describes the cause of the exception. + Defaults to "Invalid URL". + +Methods +------- +__init__(self, message: str = "Invalid URL") + Initializes the exception with a custom message or a default message. +
search_engine.search_engine.exceptions.UrlError.__init__ | +( | ++ | self, | +
+ | + | + | message = "Invalid URL" |
+
+ | ) | ++ |
Initializes the UrlError exception with a custom error message. + +Parameters +---------- +message : str, optional + The error message to describe the exception. Default is "Invalid URL". ++
search_engine.search_engine.exceptions.UrlError.message | +
+ Search Engine
+
+ |
+
This is the complete list of members for search_engine.search_engine.helper.converter.StringToIntConverter, including all inherited members.
+
+ Search Engine
+
+ |
+
+Public Member Functions | |
__init__ (self) | |
int | convert (self, str word) |
+Public Attributes | |
string_to_int | |
next_int | |
A class that converts strings to integers. If a string has already been converted, +it returns the previously assigned integer. Otherwise, it assigns a new integer +to the string and returns it. + +Attributes +---------- +string_to_int : dict + A dictionary mapping strings to their corresponding integers. +next_int : int + The next integer to be assigned to a string. + +Methods +------- +__init__() + Initializes the converter with an empty dictionary and sets the next integer to 0. +convert(word: str) -> int + Converts a given string to an integer. If the string has been seen before, + it returns the corresponding integer, otherwise it assigns a new integer to the string. +
search_engine.search_engine.helper.converter.StringToIntConverter.__init__ | +( | ++ | self | ) | ++ |
Initializes the StringToIntConverter with an empty dictionary and sets the next integer +to 0 for string-to-integer mapping. + +Attributes +---------- +string_to_int : dict + An empty dictionary to store the mapping of strings to integers. +next_int : int + Initialized to 0, it represents the next available integer to be assigned to a string. ++
int search_engine.search_engine.helper.converter.StringToIntConverter.convert | +( | ++ | self, | +
+ | + | str | +word | +
+ | ) | ++ |
Convert a string to an integer. If the string has already been seen, +return its corresponding integer. Otherwise, assign a new integer to it. + +Parameters +---------- +word : str + The string to be converted. + +Returns +------- +int + The integer corresponding to the string. ++
search_engine.search_engine.helper.converter.StringToIntConverter.next_int | +
search_engine.search_engine.helper.converter.StringToIntConverter.string_to_int | +
+ Search Engine
+
+ |
+
+Classes | |
class | search_engine.search_engine.helper.converter.StringToIntConverter |
+Namespaces | |
namespace | search_engine |
namespace | search_engine.search_engine |
namespace | search_engine.search_engine.helper |
namespace | search_engine.search_engine.helper.converter |
+ Search Engine
+
+ |
+
+Classes | |
class | search_engine.search_engine.crawler.Crawler |
+Namespaces | |
namespace | search_engine |
namespace | search_engine.search_engine |
namespace | search_engine.search_engine.crawler |
+ Search Engine
+
+ |
+
File in lib/src | Includes file in lib/include |
---|---|
hello.cpp | hello.h |
inverted_index.cpp | inverted_index.h |
page_rank.cpp | page_rank.h |
subtraction.cpp | subtraction.h |
sum.cpp | sum.h |
+ Search Engine
+
+ |
+
+Directories | |
helper | |
+Files | |
__init__.py | |
crawler.py | |
exceptions.py | |
Files | |
_hello.cpp | |
_inverted_index.cpp | |
_page_rank.cpp | |
hello.cpp |
+ Search Engine
+
+ |
+
+ Search Engine
+
+ |
+
+Classes | |
class | search_engine.search_engine.exceptions.UrlError |
+Namespaces | |
namespace | search_engine |
namespace | search_engine.search_engine |
namespace | search_engine.search_engine.exceptions |
+ Search Engine
+
+ |
+
+ Search Engine
+
+ |
+
Csearch_engine.search_engine.crawler.Crawler | |
Cinverted_index::docs | Structure that stores information about a document |
Cpage_rank::Graph | Represents a directed graph used to compute the PageRank algorithm |
CHello | |
Csearch_engine.search_engine.helper.converter.StringToIntConverter | |
▼CValueError | |
Csearch_engine.search_engine.exceptions.UrlError |
+ Search Engine
+
+ |
+
+ + |
+ + |
+ + |
+ + |
+ + |
+ + |
▼Ninverted_index | |
Cdocs | Structure that stores information about a document |
▼Npage_rank | |
CGraph | Represents a directed graph used to compute the PageRank algorithm |
▼Nsearch_engine | |
▼Nsearch_engine | |
▼Ncrawler | |
CCrawler | |
▼Nexceptions | |
CUrlError | |
▼Nhelper | |
▼Nconverter | |
CStringToIntConverter |
+ Search Engine
+
+ |
+
+Namespaces | |
namespace | search_engine |
+ Search Engine
+
+ |
+
+Namespaces | |
namespace | crawler |
namespace | exceptions |
namespace | helper |
+ Search Engine
+
+ |
+
+Classes | |
class | Crawler |
+ Search Engine
+
+ |
+
+Classes | |
class | UrlError |
+ Search Engine
+
+ |
+
+Namespaces | |
namespace | converter |
+ Search Engine
+
+ |
+
+Classes | |
class | StringToIntConverter |
paragraphs = soup.find_all('p') # Find links in tag
- links = [a.get('href') for p in paragraphs for a in p.find_all('a', href=True) + links = [a.get('href') for p in paragraphs for a in p.find_all('a', href=True) if a.get('href').startswith(self.page_name)] return links return [] - + def _get_paragraphs(self, current_page: str) -> list: """ Retrieve all paragraphs from the specified page. - Parameters - ---------- - current_page : str - The current page to fetch paragraphs from. - - Returns - ------- - paragraphs : list of str - A list of paragraphs' text found on the current page. + @param current_page: The current page to fetch paragraphs from. + + @return: A list of paragraphs' text found on the current page. """ response = requests.get(self.url_base + self.page_name + current_page) if response.status_code == 200: @@ -125,37 +105,28 @@ def _validate_url(self, url: str): """ Validate the format of the given URL. - Parameters - ---------- - url : str - The URL to validate. - - Raises - ------ - UrlError - If the URL is not valid. + @param url: The URL to validate. + + @raises UrlError: If the URL is not valid. """ if not re.match(self.REGEX, url): raise UrlError() - + def run(self, limit = 2): """ Run the crawler to collect links and paragraphs starting from the base page. It uses a queue to traverse the links in a breadth-first manner. - Returns - ------- - all_links : list of str - A list of all links found during the crawling process. - all_paragraphs : list of str - A list of all paragraphs found during the crawling process. + @param limit: The maximum number of pages to crawl in test mode. + + @return: A graph representation of the pages and their links. 
""" # Initialize queue with the starting page queue = deque([self.initial_page]) visited = set() # Track visited pages to avoid processing them multiple times all_links = [] # Store all the links found during crawling all_paragraphs = [] # Store all the paragraphs found during crawling - + run = True counter = 0 @@ -186,13 +157,13 @@ def run(self, limit = 2): # Add new links to the queue if they haven't been visited for link in links: if link not in visited and link not in queue and link not in self.remove_pages: - # Extrai a ultima página + # Extract the last part of the link as the next page next_page = link.split('/')[-1] next_page_int = self.converter.convert(next_page) # Make graph self.graph.add_edge(current_page_int, next_page_int) queue.append(next_page) - + # Only when test_mode is activated if self.test_mode: counter += 1 diff --git a/search_engine/exceptions.py b/search_engine/exceptions.py index 014c885..25267a5 100644 --- a/search_engine/exceptions.py +++ b/search_engine/exceptions.py @@ -1,5 +1,29 @@ class UrlError(ValueError): - """Raised when an invalid URL is provided.""" + """ + Exception raised when an invalid URL is provided. + + Inherits from the built-in `ValueError` class. + + Attributes + ---------- + message : str + The error message that describes the cause of the exception. + Defaults to "Invalid URL". + + Methods + ------- + __init__(self, message: str = "Invalid URL") + Initializes the exception with a custom message or a default message. + """ + def __init__(self, message="Invalid URL"): + """ + Initializes the UrlError exception with a custom error message. + + Parameters + ---------- + message : str, optional + The error message to describe the exception. Default is "Invalid URL". 
+ """ self.message = message - super().__init__(self.message) \ No newline at end of file + super().__init__(self.message) diff --git a/search_engine/helper/converter.py b/search_engine/helper/converter.py index 798ab4c..cc1f981 100644 --- a/search_engine/helper/converter.py +++ b/search_engine/helper/converter.py @@ -1,5 +1,37 @@ class StringToIntConverter: + """ + A class that converts strings to integers. If a string has already been converted, + it returns the previously assigned integer. Otherwise, it assigns a new integer + to the string and returns it. + + Attributes + ---------- + string_to_int : dict + A dictionary mapping strings to their corresponding integers. + next_int : int + The next integer to be assigned to a string. + + Methods + ------- + __init__() + Initializes the converter with an empty dictionary and sets the next integer to 0. + convert(word: str) -> int + Converts a given string to an integer. If the string has been seen before, + it returns the corresponding integer, otherwise it assigns a new integer to the string. + """ + def __init__(self): + """ + Initializes the StringToIntConverter with an empty dictionary and sets the next integer + to 0 for string-to-integer mapping. + + Attributes + ---------- + string_to_int : dict + An empty dictionary to store the mapping of strings to integers. + next_int : int + Initialized to 0, it represents the next available integer to be assigned to a string. + """ self.string_to_int = {} self.next_int = 0 @@ -7,12 +39,12 @@ def convert(self, word: str) -> int: """ Convert a string to an integer. If the string has already been seen, return its corresponding integer. Otherwise, assign a new integer to it. - + Parameters ---------- word : str The string to be converted. 
- + Returns ------- int @@ -21,4 +53,4 @@ def convert(self, word: str) -> int: if word not in self.string_to_int: self.string_to_int[word] = self.next_int self.next_int += 1 - return self.string_to_int[word] \ No newline at end of file + return self.string_to_int[word]