Skip to content

Link Filters

Base class for link filters

Parameters:

Name Type Description Default
website str

website address (scheme and domain), e.g. https://example.com

required
Source code in extract_emails/link_filters/link_filter_base.py
def __init__(self, website: str):
    """

    Args:
        website: website address (scheme and domain), e.g. https://example.com
    """
    self.website = website

Filter links by some parameters

Parameters:

Name Type Description Default
urls Iterable[str]

List of URLs for filtering

required

Returns:

Type Description
List[str]

List of filtered URLs

Source code in extract_emails/link_filters/link_filter_base.py
@abstractmethod
def filter(self, urls: Iterable[str]) -> List[str]:
    """Apply this filter's criteria to a collection of URLs.

    Subclasses decide which URLs survive (e.g. same-site only,
    contact-page keywords).

    Args:
        urls: URLs to be filtered

    Returns:
        The URLs that passed the filter
    """

Extract all URLs corresponding to current website

Examples:

>>> from extract_emails.link_filters import LinkFilterBase
>>> links = LinkFilterBase.get_links(page_source)
>>> links
["example.com", "/example.com", "https://example2.com"]

Parameters:

Name Type Description Default
page_source str

HTML page source

required

Returns:

Type Description
List[str]

List of URLs

:param str page_source: HTML page source. :return: List of URLs.

Source code in extract_emails/link_filters/link_filter_base.py
@staticmethod
def get_links(page_source: str) -> List[str]:
    """Extract all URLs corresponding to current website

    Examples:
        >>> from extract_emails.link_filters import LinkFilterBase
        >>> links = LinkFilterBase.get_links(page_source)
        >>> links
        ["example.com", "/example.com", "https://example2.com"]

    Args:
        page_source: HTML page source

    Returns:
        List of URLs
    """
    # RE_LINKS is a module-level compiled regex; each match is a tuple
    # whose index 1 holds the link target (href value).
    return [match[1] for match in RE_LINKS.findall(page_source)]

Extract scheme and domain name from an URL

Examples:

>>> from extract_emails.link_filters import LinkFilterBase
>>> website = LinkFilterBase.get_website_address('https://example.com/list?page=134')
>>> website
'https://example.com/'

Parameters:

Name Type Description Default
url str

URL for parsing

required

Returns:

Type Description
str

scheme and domain name from URL, e.g. https://example.com

Source code in extract_emails/link_filters/link_filter_base.py
@staticmethod
def get_website_address(url: str) -> str:
    """Extract scheme and domain name from an URL

    Examples:
        >>> from extract_emails.link_filters import LinkFilterBase
        >>> website = LinkFilterBase.get_website_address('https://example.com/list?page=134')
        >>> website
        'https://example.com/'

    Args:
        url: URL for parsing

    Returns:
        scheme and domain name from URL, e.g. https://example.com

    """
    parsed_url = urlparse(url)
    return f"{parsed_url.scheme}://{parsed_url.netloc}/"

Default filter for links

Excludes from the list every URL that does not start with self.website and does not start with '/'.

Examples:

>>> from extract_emails.link_filters import DefaultLinkFilter
>>> test_urls = ["https://example.com/page1.html","/page.html","/page.html", "https://google.com"]
>>> link_filter = DefaultLinkFilter("https://example.com/")
>>> filtered_urls = link_filter.filter(test_urls)
>>> filtered_urls
["https://example.com/page1.html", "https://example.com/page.html"]

Parameters:

Name Type Description Default
links Iterable[str]

List of links for filtering

required

Returns:

Type Description
List[str]

Set of filtered URLs

Source code in extract_emails/link_filters/default_link_filter.py
def filter(self, links: Iterable[str]) -> List[str]:
    """Keep only unseen URLs belonging to ``self.website``.

    Relative links (starting with '/') are resolved against the website
    address; URLs pointing to other sites are dropped, as is any URL
    this filter instance has already returned.

    Examples:
        >>> from extract_emails.link_filters import DefaultLinkFilter
        >>> test_urls = ["https://example.com/page1.html","/page.html","/page.html", "https://google.com"]
        >>> link_filter = DefaultLinkFilter("https://example.com/")
        >>> filtered_urls = link_filter.filter(test_urls)
        >>> filtered_urls
        ["https://example.com/page1.html", "https://example.com/page.html"]

    Args:
        links: List of links for filtering

    Returns:
        List of filtered URLs
    """
    accepted = []
    for candidate in links:
        absolute = urljoin(self.website, candidate)
        if absolute.startswith(self.website) and absolute not in self.checked_links:
            self.checked_links.add(absolute)
            accepted.append(absolute)
    return accepted

Contact information filter for links.

Only keep the links might contain the contact information.

Examples:

>>> from extract_emails.link_filters import ContactInfoLinkFilter
>>> link_filter = ContactInfoLinkFilter("https://example.com")
>>> filtered_links = link_filter.filter(['/about-us', '/search'])
>>> filtered_links
['https://example.com/about-us']
>>> from extract_emails.link_filters import ContactInfoLinkFilter
>>> link_filter = ContactInfoLinkFilter("https://example.com", use_default=True)
>>> filtered_links = link_filter.filter(['/blog', '/search'])
>>> filtered_links
['https://example.com/blog', 'https://example.com/search']
>>> from extract_emails.link_filters import ContactInfoLinkFilter
>>> link_filter = ContactInfoLinkFilter("https://example.com", use_default=False)
>>> filtered_links = link_filter.filter(['/blog', '/search'])
>>> filtered_links
[]
>>> from extract_emails.link_filters import ContactInfoLinkFilter
>>> link_filter = ContactInfoLinkFilter("https://example.com", contruct_candidates=['search'])
>>> filtered_links = link_filter.filter(['/blog', '/search'])
>>> filtered_links
['https://example.com/search']

Parameters:

Name Type Description Default
website str

website address (scheme and domain), e.g. https://example.com

required
contruct_candidates Optional[List[str]]

keywords for filtering the list of URLs, default: see self.default_contruct_candidates

None
use_default bool

if True and no contact-info URLs were found, return all filtered URLs instead; default: False

False
Source code in extract_emails/link_filters/contact_link_filter.py
def __init__(
    self,
    website: str,
    contruct_candidates: Optional[List[str]] = None,
    use_default: bool = False,
):
    """Initialize the contact-info link filter.

    Args:
        website: website address (scheme and domain), e.g. https://example.com
        contruct_candidates: keywords for filtering the list of URLs,
            default: see `self.default_contruct_candidates`
            (NOTE(review): parameter name keeps the existing "contruct"
            spelling for backward compatibility with callers)
        use_default: if True and no contact-info URLs were found,
            `filter()` returns all filtered URLs instead,
            default: False
    """
    super().__init__(website)
    # URLs already returned by filter(); used to skip duplicates
    # across repeated calls.
    self.checked_links = set()
    self.candidates = (
        contruct_candidates
        if contruct_candidates is not None
        else self.default_contruct_candidates
    )
    self.use_default = use_default

Filter out the links without keywords

Parameters:

Name Type Description Default
urls Iterable[str]

List of URLs for filtering

required

Returns:

Type Description
List[str]

List of filtered URLs

Source code in extract_emails/link_filters/contact_link_filter.py
def filter(self, urls: Iterable[str]) -> List[str]:
    """Keep URLs that look like contact-information pages.

    Every unseen same-site URL is collected; of those, only URLs whose
    lowercase form contains one of ``self.candidates`` count as
    contact-info links. If none match and ``self.use_default`` is True,
    the full collected list is returned instead.

    Args:
        urls: List of URLs for filtering

    Returns:
        List of filtered URLs
    """
    all_internal = []
    with_keywords = []

    for raw in urls:
        absolute = urljoin(self.website, raw)
        if not absolute.startswith(self.website):
            continue
        if absolute in self.checked_links:
            continue
        self.checked_links.add(absolute)
        all_internal.append(absolute)

        lowered = absolute.lower()
        if any(keyword in lowered for keyword in self.candidates):
            with_keywords.append(absolute)

    if not with_keywords and self.use_default:
        return all_internal
    return with_keywords