Skip to content

Data Extractors

DataExtractor

Bases: ABC

Base class for all data extractors

Source code in extract_emails/data_extractors/data_extractor.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
class DataExtractor(ABC):
    """Base class for all data extractors.

    Both ``name`` and ``get_data`` are declared abstract, so this class
    cannot be instantiated directly; a concrete extractor has to override
    both of them.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Name of the data extractor, e.g. email, linkedin.

        Returns:
            The extractor's name as a string.
        """

    @abstractmethod
    def get_data(self, page_source: str) -> set[str]:
        """Extract needed data from a string.

        Args:
            page_source: webpage content

        Returns:
            Set of data, e.g. {'email@email.com', 'email2@email.com'}.
            Because the result is a set, repeated occurrences of the same
            value appear only once.
        """

name abstractmethod property

Name of the data extractor, e.g. email, linkedin

get_data(page_source) abstractmethod

Extract needed data from a string

Parameters:

Name Type Description Default
page_source str

webpage content

required

Returns:

Type Description
set[str]

Set of data, e.g. {'email@email.com', 'email2@email.com'}

Source code in extract_emails/data_extractors/data_extractor.py
12
13
14
15
16
17
18
19
20
21
@abstractmethod
def get_data(self, page_source: str) -> set[str]:
    """Extract needed data from a string.

    Declared abstract: the body consists of this docstring only, so the
    actual extraction logic has to come from an overriding implementation.

    Args:
        page_source: webpage content

    Returns:
        Set of data, e.g. {'email@email.com', 'email2@email.com'}.
        Because the result is a set, repeated occurrences of the same
        value appear only once.
    """

EmailExtractor

Bases: DataExtractor

Source code in extract_emails/data_extractors/email_extractor.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
class EmailExtractor(DataExtractor):
    """Data extractor that looks for email addresses in a string."""

    def __init__(self):
        # Compiled once per instance and reused by every ``get_data`` call.
        # The pattern contains only non-capturing groups, so ``findall``
        # yields the whole matched address as a plain string.
        # NOTE: no flags are passed and the character classes are lowercase
        # only, so matching is case-sensitive as written.
        self.regexp = re.compile(
            r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
        )

    @property
    def name(self) -> str:
        """Name of this data extractor, always ``"email"``."""
        return "email"

    def get_data(self, page_source: str) -> set[str]:
        """Extract emails from a string

        Every match of the compiled pattern in ``page_source`` is collected
        and the resulting list is handed to ``email_filter``; whatever that
        helper returns is the result of this method.

        Args:
            page_source: webpage content

        Returns:
            Set of emails, e.g. {'email@email.com', 'email2@email.com'}
        """
        # ``findall`` already returns a new list, so it is passed on as-is
        # instead of being copied element by element first.
        return email_filter(self.regexp.findall(page_source))

get_data(page_source)

Extract emails from a string

Parameters:

Name Type Description Default
page_source str

webpage content

required

Returns:

Type Description
set[str]

Set of emails, e.g. {'email@email.com', 'email2@email.com'}

Source code in extract_emails/data_extractors/email_extractor.py
17
18
19
20
21
22
23
24
25
26
27
def get_data(self, page_source: str) -> set[str]:
    """Extract emails from a string

    Every match of ``self.regexp`` in ``page_source`` is collected and the
    resulting list is handed to ``email_filter``; whatever that helper
    returns is the result of this method.

    Args:
        page_source: webpage content

    Returns:
        Set of emails, e.g. {'email@email.com', 'email2@email.com'}
    """
    # ``findall`` already returns a new list, so it is passed on as-is
    # instead of being copied element by element first.
    return email_filter(self.regexp.findall(page_source))

LinkedinExtractor

Bases: DataExtractor

Source code in extract_emails/data_extractors/linkedin_extractor.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
class LinkedinExtractor(DataExtractor):
    """Data extractor that looks for links to Linkedin profiles in a string."""

    def __init__(self):
        # General URL pattern, compiled once per instance. ``(?i)`` makes it
        # case-insensitive. It contains capturing groups, the outermost of
        # which wraps the complete URL.
        self.regexp = re.compile(
            r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
        )

    @property
    def name(self) -> str:
        """Name of this data extractor, always ``"linkedin"``."""
        return "linkedin"

    def get_data(self, page_source: str) -> set[str]:
        """Extract links to Linkedin profiles

        All URLs are matched first; only those containing the substring
        ``linkedin.com/in/`` (compared case-sensitively) are kept.

        Args:
            page_source: webpage content

        Returns:
            Set of urls, e.g. {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'}
        """
        url_filter = "linkedin.com/in/"
        # The pattern has several capture groups, so ``findall`` returns one
        # tuple per match; element 0 is the outermost group, i.e. the URL.
        return {
            groups[0]
            for groups in self.regexp.findall(page_source)
            if url_filter in groups[0]
        }

get_data(page_source)

Extract links to Linkedin profiles

Parameters:

Name Type Description Default
page_source str

webpage content

required

Returns:

Type Description
set[str]

Set of urls, e.g. {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'}

Source code in extract_emails/data_extractors/linkedin_extractor.py
16
17
18
19
20
21
22
23
24
25
26
27
28
def get_data(self, page_source: str) -> set[str]:
    """Extract links to Linkedin profiles

    All matches of ``self.regexp`` are collected first; only those whose
    first group contains the substring ``linkedin.com/in/`` (compared
    case-sensitively) are kept.

    Args:
        page_source: webpage content

    Returns:
        Set of urls, e.g. {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'}
    """
    url_filter = "linkedin.com/in/"
    # Each ``findall`` result is indexed with [0], i.e. the first capture
    # group of the match is treated as the URL. A set comprehension builds
    # the result directly, without an intermediate list.
    return {
        groups[0]
        for groups in self.regexp.findall(page_source)
        if url_filter in groups[0]
    }