Data Extractors
DataExtractor
¶
Base class for all data extractors
name: str
property
readonly
¶
Name of the data extractor, e.g. email, linkedin
get_data(self, page_source)
¶
Extract needed data from a string
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| page_source | str | webpage content | required |
Returns:
| Type | Description |
|---|---|
| Set[str] | Set of data, e.g. {'email@email.com', 'email2@email.com'} |
Source code in extract_emails/data_extractors/data_extractor.py
@abstractmethod
def get_data(self, page_source: str) -> Set[str]:
    """Pull the extractor-specific items out of raw page text.

    This definition is marked abstract and has no logic of its own; it
    only fixes the signature and the documented contract.

    Args:
        page_source: webpage content

    Returns:
        Set of data, e.g. {'email@email.com', 'email2@email.com'}
    """
EmailExtractor
¶
name: str
property
readonly
¶
Name of the data extractor, e.g. email, linkedin
get_data(self, page_source)
¶
Extract emails from a string
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| page_source | str | webpage content | required |
Returns:
| Type | Description |
|---|---|
| Set[str] | Set of emails, e.g. {'email@email.com', 'email2@email.com'} |
Source code in extract_emails/data_extractors/email_extractor.py
def get_data(self, page_source: str) -> Set[str]:
    """Extract emails from a string.

    Every match of ``self.regexp`` in the page is collected and passed to
    ``email_filter``; whatever that helper returns is returned unchanged.

    Args:
        page_source: webpage content

    Returns:
        Set of emails, e.g. {'email@email.com', 'email2@email.com'}
    """
    # Give email_filter its own list of the raw matches.
    raw_emails = list(self.regexp.findall(page_source))
    return email_filter(raw_emails)
LinkedinExtractor
¶
name: str
property
readonly
¶
Name of the data extractor, e.g. email, linkedin
get_data(self, page_source)
¶
Extract links to LinkedIn profiles
Parameters:
| Name | Type | Description | Default |
|---|---|---|---|
| page_source | str | webpage content | required |
Returns:
| Type | Description |
|---|---|
| Set[str] | Set of urls, e.g. {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'} |
Source code in extract_emails/data_extractors/linkedin_extractor.py
def get_data(self, page_source: str) -> Set[str]:
    """Extract links to LinkedIn profiles.

    All matches of ``self.regexp`` are scanned, and only those whose first
    element contains ``linkedin.com/in/`` are kept. Duplicates collapse
    because the result is a set.

    Args:
        page_source: webpage content

    Returns:
        Set of urls, e.g. {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'}
    """
    all_urls = self.regexp.findall(page_source)
    url_filter = "linkedin.com/in/"
    # Element 0 of each match is taken as the URL text.
    # NOTE(review): this assumes self.regexp has capture groups, so that
    # findall yields tuples -- confirm against the pattern definition.
    return {match[0] for match in all_urls if url_filter in match[0]}