Workers¶
DefaultWorker
¶
All data extraction happens here.
__init__(self, factory)
special
¶
Parameters:
Name | Type | Description | Default |
---|---|---|---|
`factory` | `BaseFactory` | see `BaseFactory` | *required* |
Source code in extract_emails/workers/default_worker.py
def __init__(self, factory: BaseFactory):
    """Configure the worker from a pre-built factory.

    Args:
        factory: see `BaseFactory`
    """
    # Mirror the factory's configuration onto this worker instance.
    for attr_name in (
        "website_url",
        "browser",
        "depth",
        "max_links_from_page",
        "link_filter",
        "data_extractors",
    ):
        setattr(self, attr_name, getattr(factory, attr_name))
    # Seed the crawl frontier with the starting URL; depth counter begins at 0.
    self.links = [[self.website_url]]
    self.current_depth = 0
get_data(self)
¶
Extract data from a given website
Source code in extract_emails/workers/default_worker.py
def get_data(self) -> List[PageData]:
    """Extract data from a given website.

    Crawls level by level until the link frontier is exhausted or the
    configured depth limit is exceeded.

    Returns:
        All `PageData` records collected from the visited pages.
    """
    data: List[PageData] = []
    # Truthiness is the idiomatic form of `len(self.links) > 0`.
    # NOTE(review): assumes self._get_new_data() consumes self.links —
    # otherwise only the depth limit terminates this loop; confirm in
    # default_worker.py.
    while self.links:
        # Lazy %-style args skip string formatting when DEBUG is disabled.
        logger.debug("current_depth=%s", self.current_depth)
        # depth=None means "no limit"; otherwise stop once we pass it.
        if self.depth is not None and self.current_depth > self.depth:
            break
        self.current_depth += 1
        data.extend(self._get_new_data())
    return data