Skip to content

Link Filters

Bases: ABC

Base class for link filters

Source code in extract_emails/link_filters/link_filter_base.py
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
class LinkFilterBase(ABC):
    """Abstract base for link filters.

    Stores the target website address and provides URL helpers; concrete
    subclasses implement :meth:`filter` to decide which URLs to keep.
    """

    def __init__(self, website: str):
        """Remember the website the filter operates on.

        Args:
            website: website address (scheme and domain), e.g. https://example.com
        """
        self.website = website

    @staticmethod
    def get_website_address(url: str) -> str:
        """Return the scheme and domain of a URL, with a trailing slash.

        Examples:
            >>> from extract_emails.link_filters import LinkFilterBase
            >>> website = LinkFilterBase.get_website_address('https://example.com/list?page=134')
            >>> website
            'https://example.com/'

        Args:
            url: URL for parsing

        Returns:
            scheme and domain name from URL, e.g. https://example.com/
        """
        parts = urlparse(url)
        return "{0}://{1}/".format(parts.scheme, parts.netloc)

    @staticmethod
    def get_links(page_source: str) -> list[str]:
        """Extract all URLs found in an HTML page.

        Examples:
            >>> from extract_emails.link_filters import LinkFilterBase
            >>> links = LinkFilterBase.get_links(page_source)
            >>> links
            ["example.com", "/example.com", "https://example2.com"]

        Args:
            page_source: HTML page source

        Returns:
            List of URLs
        """
        # RE_LINKS matches anchor tags; group 1 of each match is the href value.
        return [match[1] for match in RE_LINKS.findall(page_source)]

    @abstractmethod
    def filter(self, urls: Iterable[str]) -> list[str]:
        """Reduce *urls* to the subset this filter accepts.

        Args:
            urls: candidate URLs to filter

        Returns:
            List of filtered URLs
        """

Parameters:

Name Type Description Default
website str

website address (scheme and domain), e.g. https://example.com

required
Source code in extract_emails/link_filters/link_filter_base.py
12
13
14
15
16
17
18
def __init__(self, website: str):
    """Remember the website the filter operates on.

    Args:
        website: website address (scheme and domain), e.g. https://example.com
    """
    self.website = website

Filter links by some parameters

Parameters:

Name Type Description Default
urls Iterable[str]

List of URLs for filtering

required

Returns:

Type Description
list[str]

List of filtered URLs

Source code in extract_emails/link_filters/link_filter_base.py
63
64
65
66
67
68
69
70
71
72
@abstractmethod
def filter(self, urls: Iterable[str]) -> list[str]:
    """Reduce *urls* to the subset this filter accepts.

    Args:
        urls: candidate URLs to filter

    Returns:
        List of filtered URLs
    """

Extract all URLs corresponding to current website

Examples:

>>> from extract_emails.link_filters import LinkFilterBase
>>> links = LinkFilterBase.get_links(page_source)
>>> links
["example.com", "/example.com", "https://example2.com"]

Parameters:

Name Type Description Default
page_source str

HTML page source

required

Returns:

Type Description
list[str]

List of URLs

:param str page_source: HTML page source :return: List of URLs

Source code in extract_emails/link_filters/link_filter_base.py
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
@staticmethod
def get_links(page_source: str) -> list[str]:
    """Extract all URLs found in an HTML page.

    Examples:
        >>> from extract_emails.link_filters import LinkFilterBase
        >>> links = LinkFilterBase.get_links(page_source)
        >>> links
        ["example.com", "/example.com", "https://example2.com"]

    Args:
        page_source: HTML page source

    Returns:
        List of URLs
    """
    # RE_LINKS matches anchor tags; group 1 of each match is the href value.
    return [match[1] for match in RE_LINKS.findall(page_source)]

Extract the scheme and domain name from a URL

Examples:

>>> from extract_emails.link_filters import LinkFilterBase
>>> website = LinkFilterBase.get_website_address('https://example.com/list?page=134')
>>> website
'https://example.com/'

Parameters:

Name Type Description Default
url str

URL for parsing

required

Returns:

Type Description
str

scheme and domain name from URL, e.g. https://example.com

Source code in extract_emails/link_filters/link_filter_base.py
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
@staticmethod
def get_website_address(url: str) -> str:
    """Extract scheme and domain name from an URL

    Examples:
        >>> from extract_emails.link_filters import LinkFilterBase
        >>> website = LinkFilterBase.get_website_address('https://example.com/list?page=134')
        >>> website
        'https://example.com/'

    Args:
        url: URL for parsing

    Returns:
        scheme and domain name from URL, e.g. https://example.com

    """
    parsed_url = urlparse(url)
    return f"{parsed_url.scheme}://{parsed_url.netloc}/"

Bases: LinkFilterBase

Default filter for links

Source code in extract_emails/link_filters/default_link_filter.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
class DefaultLinkFilter(LinkFilterBase):
    """Default filter for links"""

    def __init__(self, website: str):
        super().__init__(website)
        # URLs already returned by filter(); prevents duplicates across calls.
        self.checked_links: set[str] = set()

    def filter(self, links: Iterable[str]) -> list[str]:
        """Keep only URLs under ``self.website``, skipping any seen before.

        Relative links are resolved against ``self.website`` first; every
        accepted URL is remembered in ``self.checked_links`` so repeated
        calls never return the same URL twice.

        Examples:
            >>> from extract_emails.link_filters import DefaultLinkFilter
            >>> test_urls = ["https://example.com/page1.html","/page.html","/page.html", "https://google.com"]
            >>> link_filter = DefaultLinkFilter("https://example.com/")
            >>> filtered_urls = link_filter.filter(test_urls)
            >>> filtered_urls
            ["https://example.com/page1.html", "https://example.com/page.html"]

        Args:
            links: List of links for filtering

        Returns:
            List of filtered URLs
        """
        accepted = []
        for candidate in links:
            absolute = urljoin(self.website, candidate)
            if absolute.startswith(self.website) and absolute not in self.checked_links:
                self.checked_links.add(absolute)
                accepted.append(absolute)
        return accepted

Excludes from the list any URLs that do not start with self.website; relative URLs (starting with '/') are resolved against self.website first.

Examples:

>>> from extract_emails.link_filters import DefaultLinkFilter
>>> test_urls = ["https://example.com/page1.html","/page.html","/page.html", "https://google.com"]
>>> link_filter = DefaultLinkFilter("https://example.com/")
>>> filtered_urls = link_filter.filter(test_urls)
>>> filtered_urls
["https://example.com/page1.html", "https://example.com/page.html"]

Parameters:

Name Type Description Default
links Iterable[str]

List of links for filtering

required

Returns:

Type Description
list[str]

Set of filtered URLs

Source code in extract_emails/link_filters/default_link_filter.py
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
def filter(self, links: Iterable[str]) -> list[str]:
    """Keep only URLs under ``self.website``, skipping any seen before.

    Relative links are resolved against ``self.website`` first; every
    accepted URL is remembered in ``self.checked_links`` so repeated
    calls never return the same URL twice.

    Examples:
        >>> from extract_emails.link_filters import DefaultLinkFilter
        >>> test_urls = ["https://example.com/page1.html","/page.html","/page.html", "https://google.com"]
        >>> link_filter = DefaultLinkFilter("https://example.com/")
        >>> filtered_urls = link_filter.filter(test_urls)
        >>> filtered_urls
        ["https://example.com/page1.html", "https://example.com/page.html"]

    Args:
        links: List of links for filtering

    Returns:
        List of filtered URLs
    """
    accepted = []
    for candidate in links:
        absolute = urljoin(self.website, candidate)
        if absolute.startswith(self.website) and absolute not in self.checked_links:
            self.checked_links.add(absolute)
            accepted.append(absolute)
    return accepted

Bases: LinkFilterBase

Contact information filter for links.

Only keep the links that might contain the contact information.

Examples:

>>> from extract_emails.link_filters import ContactInfoLinkFilter
>>> link_filter = ContactInfoLinkFilter("https://example.com")
>>> filtered_links = link_filter.filter(['/about-us', '/search'])
>>> filtered_links
['https://example.com/about-us']
>>> from extract_emails.link_filters import ContactInfoLinkFilter
>>> link_filter = ContactInfoLinkFilter("https://example.com", use_default=True)
>>> filtered_links = link_filter.filter(['/blog', '/search'])
>>> filtered_links
['https://example.com/blog', 'https://example.com/search']
>>> from extract_emails.link_filters import ContactInfoLinkFilter
>>> link_filter = ContactInfoLinkFilter("https://example.com", use_default=False)
>>> filtered_links = link_filter.filter(['/blog', '/search'])
>>> filtered_links
[]
>>> from extract_emails.link_filters import ContactInfoLinkFilter
>>> link_filter = ContactInfoLinkFilter("https://example.com", contruct_candidates=['search'])
>>> filtered_links = link_filter.filter(['/blog', '/search'])
>>> filtered_links
['https://example.com/search']
Source code in extract_emails/link_filters/contact_link_filter.py
  7
  8
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
class ContactInfoLinkFilter(LinkFilterBase):
    """Contact information filter for links.

    Only keep the links that might contain the contact information.

    Examples:
        >>> from extract_emails.link_filters import ContactInfoLinkFilter
        >>> link_filter = ContactInfoLinkFilter("https://example.com")
        >>> filtered_links = link_filter.filter(['/about-us', '/search'])
        >>> filtered_links
        ['https://example.com/about-us']


        >>> from extract_emails.link_filters import ContactInfoLinkFilter
        >>> link_filter = ContactInfoLinkFilter("https://example.com", use_default=True)
        >>> filtered_links = link_filter.filter(['/blog', '/search'])
        >>> filtered_links
        ['https://example.com/blog', 'https://example.com/search']

        >>> from extract_emails.link_filters import ContactInfoLinkFilter
        >>> link_filter = ContactInfoLinkFilter("https://example.com", use_default=False)
        >>> filtered_links = link_filter.filter(['/blog', '/search'])
        >>> filtered_links
        []

        >>> from extract_emails.link_filters import ContactInfoLinkFilter
        >>> link_filter = ContactInfoLinkFilter("https://example.com", contruct_candidates=['search'])
        >>> filtered_links = link_filter.filter(['/blog', '/search'])
        >>> filtered_links
        ['https://example.com/search']
    """

    # Keywords that typically appear in the URL of a contact/about page.
    default_contruct_candidates = [
        "about",
        "about-us",
        "aboutus",
        "contact",
        "contact-us",
        "contactus",
    ]

    # URLs already returned by filter(); assigned per instance in __init__.
    # (Annotation only: a class-level `set()` here would be a mutable object
    # shared by every instance that skipped __init__.)
    checked_links: set[str]

    def __init__(
        self,
        website: str,
        contruct_candidates: list[str] | None = None,
        use_default: bool = False,
    ):
        """
        Args:
            website: website address (scheme and domain), e.g. https://example.com
            contruct_candidates: keywords for filtering the list of URLs,
                default: see `self.default_contruct_candidates`
            use_default: if True and no contact-info URLs were found,
                return all filtered URLs instead, default: False
        """
        super().__init__(website)
        self.checked_links = set()
        self.candidates = (
            contruct_candidates
            if contruct_candidates is not None
            else self.default_contruct_candidates
        )
        self.use_default = use_default

    def filter(self, urls: Iterable[str]) -> list[str]:
        """Filter out the links without keywords

        Args:
            urls: List of URLs for filtering

        Returns:
            List of filtered URLs
        """
        filtered_urls = []
        contactinfo_urls = []

        for url in urls:
            # Resolve relative links against the website address.
            url = urljoin(self.website, url)

            if not url.startswith(self.website):
                continue
            if url in self.checked_links:
                continue
            filtered_urls.append(url)
            self.checked_links.add(url)

            # Keep the URL separately if it contains any contact keyword.
            for cand in self.candidates:
                if cand in url.lower():
                    contactinfo_urls.append(url)
                    break

        # Fall back to all in-site URLs only when asked to and nothing matched.
        return (
            filtered_urls
            if len(contactinfo_urls) == 0 and self.use_default
            else contactinfo_urls
        )

Parameters:

Name Type Description Default
website str

website address (scheme and domain), e.g. https://example.com

required
contruct_candidates list[str] | None

keywords for filtering the list of URLs, default: see self.default_contruct_candidates

None
use_default bool

if True and no contact-info URLs are found, return all filtered URLs instead; default: False

False
Source code in extract_emails/link_filters/contact_link_filter.py
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
def __init__(
    self,
    website: str,
    contruct_candidates: list[str] | None = None,
    use_default: bool = False,
):
    """
    Args:
        website: website address (scheme and domain), e.g. https://example.com
        contruct_candidates: keywords for filtering the list of URLs,
            default: see `self.default_contruct_candidates`
        use_default: if True and no contact-info URLs were found,
            return all filtered URLs instead, default: False
    """
    super().__init__(website)
    # Per-instance set of URLs already returned by filter().
    self.checked_links = set()
    self.candidates = (
        contruct_candidates
        if contruct_candidates is not None
        else self.default_contruct_candidates
    )
    self.use_default = use_default

Filter out the links without keywords

Parameters:

Name Type Description Default
urls Iterable[str]

List of URLs for filtering

required

Returns:

Type Description
list[str]

List of filtered URLs

Source code in extract_emails/link_filters/contact_link_filter.py
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
def filter(self, urls: Iterable[str]) -> list[str]:
    """Keep only URLs whose address contains a contact keyword.

    Each URL is resolved against ``self.website``; URLs outside the website
    or already seen are dropped. If nothing matched a keyword and
    ``self.use_default`` is True, all in-site URLs are returned instead.

    Args:
        urls: List of URLs for filtering

    Returns:
        List of filtered URLs
    """
    all_urls = []
    keyword_urls = []

    for raw in urls:
        absolute = urljoin(self.website, raw)
        if not absolute.startswith(self.website) or absolute in self.checked_links:
            continue
        self.checked_links.add(absolute)
        all_urls.append(absolute)

        lowered = absolute.lower()
        if any(keyword in lowered for keyword in self.candidates):
            keyword_urls.append(absolute)

    if not keyword_urls and self.use_default:
        return all_urls
    return keyword_urls