Data Extractors¶

`DataExtractor` ¶

Bases: ABC

Base class for all data extractors

Source code in extract_emails/data_extractors/data_extractor.py

class DataExtractor(ABC):
    """Abstract interface shared by every data extractor

    A concrete extractor must provide both the ``name`` property and the
    ``get_data`` method; the class cannot be instantiated until they are
    overridden.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Short label identifying the extractor, e.g. email, linkedin"""

    @abstractmethod
    def get_data(self, page_source: str) -> set[str]:
        """Collect the items this extractor targets from page content

        Subclasses must override this method; the base implementation has
        no body.

        Args:
            page_source: text of the webpage to scan

        Returns:
            Set of extracted strings, e.g. {'email@email.com', 'email2@email.com'}
        """

`name` `abstractmethod` `property` ¶

Name of the data extractor, e.g. email, linkedin

`get_data(page_source)` `abstractmethod` ¶

Extract needed data from a string

Parameters:

Name	Type	Description	Default
`page_source`	`str`	webpage content	required

Returns:

Type	Description
`set[str]`	Set of data, e.g. {'email@email.com', 'email2@email.com'}

Source code in extract_emails/data_extractors/data_extractor.py

@abstractmethod
def get_data(self, page_source: str) -> set[str]:
    """Collect the items this extractor targets from page content

    Subclasses must override this method; the base implementation has
    no body.

    Args:
        page_source: text of the webpage to scan

    Returns:
        Set of extracted strings, e.g. {'email@email.com', 'email2@email.com'}
    """

`EmailExtractor` ¶

Bases: DataExtractor

Source code in extract_emails/data_extractors/email_extractor.py

class EmailExtractor(DataExtractor):
    """Data extractor that pulls email addresses out of page content"""

    def __init__(self):
        # The pattern only lists lowercase letters and no flags are passed,
        # so matching is case-sensitive. Every group is non-capturing, which
        # makes findall() return the matched addresses as plain strings.
        self.regexp = re.compile(
            r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
        )

    @property
    def name(self) -> str:
        """Name of the data extractor: always ``"email"``"""
        return "email"

    def get_data(self, page_source: str) -> set[str]:
        """Find the email addresses contained in a string

        All matches of ``self.regexp`` are handed to ``email_filter`` and
        its result is returned unchanged.

        Args:
            page_source: webpage content

        Returns:
            Set of emails, e.g. {'email@email.com', 'email2@email.com'}
        """
        candidates = self.regexp.findall(page_source)
        return email_filter(candidates)

`get_data(page_source)` ¶

Extract emails from a string

Parameters:

Name	Type	Description	Default
`page_source`	`str`	webpage content	required

Returns:

Type	Description
`set[str]`	Set of emails, e.g. {'email@email.com', 'email2@email.com'}

Source code in extract_emails/data_extractors/email_extractor.py

def get_data(self, page_source: str) -> set[str]:
    """Find the email addresses contained in a string

    All matches of ``self.regexp`` are handed to ``email_filter`` and
    its result is returned unchanged.

    Args:
        page_source: webpage content

    Returns:
        Set of emails, e.g. {'email@email.com', 'email2@email.com'}
    """
    candidates = self.regexp.findall(page_source)
    return email_filter(candidates)

`LinkedinExtractor` ¶

Bases: DataExtractor

Source code in extract_emails/data_extractors/linkedin_extractor.py

class LinkedinExtractor(DataExtractor):
    """Data extractor that collects links to Linkedin profiles"""

    def __init__(self):
        # Generic, case-insensitive URL pattern. It contains several capturing
        # groups, so findall() returns one tuple per match whose first item is
        # the complete URL.
        self.regexp = re.compile(
            r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
        )

    @property
    def name(self) -> str:
        """Name of the data extractor: always ``"linkedin"``"""
        return "linkedin"

    def get_data(self, page_source: str) -> set[str]:
        """Extract links to Linkedin profiles

        Every URL found by ``self.regexp`` is kept when it contains
        ``linkedin.com/in/``. The comparison ignores letter case, in line
        with the case-insensitive URL pattern, so links written as
        ``LinkedIn.com/in/...`` are not lost. URLs are returned exactly as
        they appear in the page.

        Args:
            page_source: webpage content

        Returns:
            Set of urls, e.g. {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'}
        """
        url_filter = "linkedin.com/in/"
        # Each findall() item is a tuple of captured groups; the first group
        # is the full URL.
        return {
            groups[0]
            for groups in self.regexp.findall(page_source)
            if url_filter in groups[0].lower()
        }

`get_data(page_source)` ¶

Extract links to Linkedin profiles

Parameters:

Name	Type	Description	Default
`page_source`	`str`	webpage content	required

Returns:

Type	Description
`set[str]`	Set of urls, e.g. {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'}

Source code in extract_emails/data_extractors/linkedin_extractor.py

def get_data(self, page_source: str) -> set[str]:
    """Extract links to Linkedin profiles

    Every URL found by ``self.regexp`` is kept when it contains
    ``linkedin.com/in/``. The comparison ignores letter case, in line
    with the case-insensitive URL pattern, so links written as
    ``LinkedIn.com/in/...`` are not lost. URLs are returned exactly as
    they appear in the page.

    Args:
        page_source: webpage content

    Returns:
        Set of urls, e.g. {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'}
    """
    url_filter = "linkedin.com/in/"
    # Each findall() item is a tuple of captured groups; the first group
    # is the full URL.
    return {
        groups[0]
        for groups in self.regexp.findall(page_source)
        if url_filter in groups[0].lower()
    }

Data Extractors¶

DataExtractor ¶

name abstractmethod property ¶

get_data(page_source) abstractmethod ¶

EmailExtractor ¶

get_data(page_source) ¶

LinkedinExtractor ¶

get_data(page_source) ¶

`DataExtractor` ¶

`name` `abstractmethod` `property` ¶

`get_data(page_source)` `abstractmethod` ¶

`EmailExtractor` ¶

`get_data(page_source)` ¶

`LinkedinExtractor` ¶

`get_data(page_source)` ¶