Link Filters¶
LinkFilterBase
¶
Base class for link filters
__init__(self, website)
special
¶
Parameters:
Name | Type | Description | Default |
---|---|---|---|
website |
str |
website address (scheme and domain), e.g. https://example.com |
required |
Source code in extract_emails/link_filters/link_filter_base.py
def __init__(self, website: str):
    """Store the base website address used by the filtering methods.

    Args:
        website: website address (scheme and domain), e.g. https://example.com
    """
    # Base address; subclasses compare candidate URLs against this prefix.
    self.website = website
filter(self, urls)
¶
Filter links by some parameters
Parameters:
Name | Type | Description | Default |
---|---|---|---|
urls |
Iterable[str] |
List of URLs for filtering |
required |
Returns:
Type | Description |
---|---|
List[str] |
List of filtered URLs |
Source code in extract_emails/link_filters/link_filter_base.py
@abstractmethod
def filter(self, urls: Iterable[str]) -> List[str]:
    """Filter links by some parameters.

    Abstract hook: each subclass decides which of the given URLs to keep.

    Args:
        urls: List of URLs for filtering

    Returns:
        List of filtered URLs
    """
get_links(page_source)
staticmethod
¶
Extract all URLs corresponding to current website
Examples:
>>> from extract_emails.link_filters import LinkFilterBase
>>> links = LinkFilterBase.get_links(page_source)
>>> links
["example.com", "/example.com", "https://example2.com"]
Parameters:
Name | Type | Description | Default |
---|---|---|---|
page_source |
str |
HTML page source |
required |
Returns:
Type | Description |
---|---|
List[str] |
List of URLs |
Returns a list of URLs extracted from the given HTML page source.
Source code in extract_emails/link_filters/link_filter_base.py
@staticmethod
def get_links(page_source: str) -> List[str]:
    """Extract all link URLs found in the given HTML page source.

    Note: this extracts every href match, not only links belonging to the
    current website — filtering against the website happens in `filter()`.

    Examples:
        >>> from extract_emails.link_filters import LinkFilterBase
        >>> links = LinkFilterBase.get_links(page_source)
        >>> links
        ["example.com", "/example.com", "https://example2.com"]

    Args:
        page_source: HTML page source

    Returns:
        List of URLs
    """
    matches = RE_LINKS.findall(page_source)
    # RE_LINKS captures tuples; the URL itself is in the second group.
    return [match[1] for match in matches]
get_website_address(url)
staticmethod
¶
Extract scheme and domain name from an URL
Examples:
>>> from extract_emails.link_filters import LinkFilterBase
>>> website = LinkFilterBase.get_website_address('https://example.com/list?page=134')
>>> website
'https://example.com/'
Parameters:
Name | Type | Description | Default |
---|---|---|---|
url |
str |
URL for parsing |
required |
Returns:
Type | Description |
---|---|
str |
scheme and domain name from URL, e.g. https://example.com |
Source code in extract_emails/link_filters/link_filter_base.py
@staticmethod
def get_website_address(url: str) -> str:
"""Extract scheme and domain name from an URL
Examples:
>>> from extract_emails.link_filters import LinkFilterBase
>>> website = LinkFilterBase.get_website_address('https://example.com/list?page=134')
>>> website
'https://example.com/'
Args:
url: URL for parsing
Returns:
scheme and domain name from URL, e.g. https://example.com
"""
parsed_url = urlparse(url)
return f"{parsed_url.scheme}://{parsed_url.netloc}/"
DefaultLinkFilter
¶
Default filter for links
filter(self, links)
¶
Excludes URLs that do not start with `self.website` and do not start with '/'
Examples:
>>> from extract_emails.link_filters import DefaultLinkFilter
>>> test_urls = ["https://example.com/page1.html","/page.html","/page.html", "https://google.com"]
>>> link_filter = DefaultLinkFilter("https://example.com/")
>>> filtered_urls = link_filter.filter(test_urls)
>>> filtered_urls
["https://example.com/page1.html", "https://example.com/page.html"]
Parameters:
Name | Type | Description | Default |
---|---|---|---|
links |
Iterable[str] |
List of links for filtering |
required |
Returns:
Type | Description |
---|---|
List[str] |
Set of filtered URLs |
Source code in extract_emails/link_filters/default_link_filter.py
def filter(self, links: Iterable[str]) -> List[str]:
    """Keep only URLs belonging to `self.website`, resolving relative links.

    Each link is resolved against `self.website`; anything that does not end
    up under that prefix is dropped, and URLs already seen in previous calls
    (tracked in `self.checked_links`) are skipped.

    Examples:
        >>> from extract_emails.link_filters import DefaultLinkFilter
        >>> test_urls = ["https://example.com/page1.html","/page.html","/page.html", "https://google.com"]
        >>> link_filter = DefaultLinkFilter("https://example.com/")
        >>> filtered_urls = link_filter.filter(test_urls)
        >>> filtered_urls
        ["https://example.com/page1.html", "https://example.com/page.html"]

    Args:
        links: List of links for filtering

    Returns:
        Set of filtered URLs
    """
    accepted: List[str] = []
    for candidate in links:
        absolute = urljoin(self.website, candidate)
        # Keep only on-site URLs we have not returned before.
        if absolute.startswith(self.website) and absolute not in self.checked_links:
            self.checked_links.add(absolute)
            accepted.append(absolute)
    return accepted
ContactInfoLinkFilter
¶
Contact information filter for links.
Only keep the links might contain the contact information.
Examples:
>>> from extract_emails.link_filters import ContactInfoLinkFilter
>>> link_filter = ContactInfoLinkFilter("https://example.com")
>>> filtered_links = link_filter.filter(['/about-us', '/search'])
>>> filtered_links
['https://example.com/about-us']
>>> from extract_emails.link_filters import ContactInfoLinkFilter
>>> link_filter = ContactInfoLinkFilter("https://example.com", use_default=True)
>>> filtered_links = link_filter.filter(['/blog', '/search'])
>>> filtered_links
['https://example.com/blog', 'https://example.com/search']
>>> from extract_emails.link_filters import ContactInfoLinkFilter
>>> link_filter = ContactInfoLinkFilter("https://example.com", use_default=False)
>>> filtered_links = link_filter.filter(['/blog', '/search'])
>>> filtered_links
[]
>>> from extract_emails.link_filters import ContactInfoLinkFilter
>>> link_filter = ContactInfoLinkFilter("https://example.com", contruct_candidates=['search'])
>>> filtered_links = link_filter.filter(['/blog', '/search'])
>>> filtered_links
['https://example.com/search']
__init__(self, website, contruct_candidates=None, use_default=False)
special
¶
Parameters:
Name | Type | Description | Default |
---|---|---|---|
website |
str |
website address (scheme and domain), e.g. https://example.com |
required |
contruct_candidates |
Optional[List[str]] |
keywords for filtering the list of URLs,
default: see `default_contruct_candidates` |
None |
use_default |
bool |
if True, return all filtered URLs when no contact-info URLs are found; default: False |
False |
Source code in extract_emails/link_filters/contact_link_filter.py
def __init__(
    self,
    website: str,
    contruct_candidates: Optional[List[str]] = None,
    use_default: bool = False,
):
    """
    Args:
        website: website address (scheme and domain), e.g. https://example.com
        contruct_candidates: keywords for filtering the list of URLs,
            default: see `self.default_contruct_candidates`
            (NOTE(review): parameter name has a typo — "construct" — kept
            as-is for backward compatibility)
        use_default: if True, `filter()` returns all filtered URLs when none
            of them match the keywords; default: False
    """
    super().__init__(website)
    # URLs already returned by filter(); prevents duplicates across calls.
    self.checked_links = set()
    # An explicit empty list is honored; only None falls back to the
    # class-level default keyword list.
    self.candidates = (
        contruct_candidates
        if contruct_candidates is not None
        else self.default_contruct_candidates
    )
    self.use_default = use_default
filter(self, urls)
¶
Filter out the links without keywords
Parameters:
Name | Type | Description | Default |
---|---|---|---|
urls |
Iterable[str] |
List of URLs for filtering |
required |
Returns:
Type | Description |
---|---|
List[str] |
List of filtered URLs |
Source code in extract_emails/link_filters/contact_link_filter.py
def filter(self, urls: Iterable[str]) -> List[str]:
    """Keep on-site URLs whose address contains one of the contact keywords.

    Every URL is resolved against `self.website`; off-site and already-seen
    URLs are dropped. Of the remaining URLs, only those containing one of
    `self.candidates` (case-insensitive) are returned — unless none match
    and `self.use_default` is True, in which case all remaining URLs are
    returned instead.

    Args:
        urls: List of URLs for filtering

    Returns:
        List of filtered URLs
    """
    all_new: List[str] = []
    matching: List[str] = []
    for raw in urls:
        absolute = urljoin(self.website, raw)
        if not absolute.startswith(self.website) or absolute in self.checked_links:
            continue
        self.checked_links.add(absolute)
        all_new.append(absolute)
        if any(keyword in absolute.lower() for keyword in self.candidates):
            matching.append(absolute)
    # Fallback: when nothing looked like a contact page, optionally return
    # everything that survived the on-site/duplicate filtering.
    if not matching and self.use_default:
        return all_new
    return matching