Data Extractors¶

`DataExtractor` ¶

Bases: ABC

Base class for all data extractors

Source code in extract_emails/data_extractors/data_extractor.py

class DataExtractor(ABC):
    """Base class for all data extractors.

    Concrete extractors must implement both the ``name`` property and the
    ``get_data`` method; the class cannot be instantiated otherwise.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Name of the data extractor, e.g. email, linkedin"""

    @abstractmethod
    def get_data(self, page_source: str) -> set[str]:
        """Extract needed data from a string.

        Args:
            page_source: webpage content

        Returns:
            Set of data, e.g. {'email@email.com', 'email2@email.com'}
        """

`name` `abstractmethod` `property` ¶

Name of the data extractor, e.g. email, linkedin

`get_data(page_source)` `abstractmethod` ¶

Extract needed data from a string

Parameters:

Name	Type	Description	Default
`page_source`	`str`	webpage content	required

Returns:

Type	Description
`set[str]`	Set of data, e.g. {'email@email.com', 'email2@email.com'}

Source code in extract_emails/data_extractors/data_extractor.py

@abstractmethod
def get_data(self, page_source: str) -> set[str]:
    """Extract needed data from a string.

    Abstract: this declaration has no implementation of its own and must be
    overridden.

    Args:
        page_source: webpage content

    Returns:
        Set of data, e.g. {'email@email.com', 'email2@email.com'}
    """

`EmailExtractor` ¶

Bases: DataExtractor

Source code in extract_emails/data_extractors/email_extractor.py

class EmailExtractor(DataExtractor):
    """Extractor that collects e-mail addresses found in page content."""

    def __init__(self):
        # Compiled without flags and the character classes only list a-z,
        # so addresses are matched in lower case only.
        self.regexp = re.compile(
            r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])"""
        )

    @property
    def name(self) -> str:
        """Name of the data extractor: "email"."""
        return "email"

    def get_data(self, page_source: str) -> set[str]:
        """Extract emails from a string.

        Every regex match is handed to ``email_filter``, whose result is
        returned unchanged.

        Args:
            page_source: webpage content

        Returns:
            Set of emails, e.g. {'email@email.com', 'email2@email.com'}
        """
        # The pattern has no capturing groups, so findall() already yields a
        # fresh list of plain strings.
        candidates = self.regexp.findall(page_source)
        return email_filter(candidates)

`get_data(page_source)` ¶

Extract emails from a string

Parameters:

Name	Type	Description	Default
`page_source`	`str`	webpage content	required

Returns:

Type	Description
`set[str]`	Set of emails, e.g. {'email@email.com', 'email2@email.com'}

Source code in extract_emails/data_extractors/email_extractor.py

def get_data(self, page_source: str) -> set[str]:
    """Extract emails from a string.

    Every match of ``self.regexp`` is handed to ``email_filter``, whose
    result is returned unchanged.

    Args:
        page_source: webpage content

    Returns:
        Set of emails, e.g. {'email@email.com', 'email2@email.com'}
    """
    # findall() already returns a fresh list, so it is passed on directly.
    candidates = self.regexp.findall(page_source)
    return email_filter(candidates)

`AdvancedEmailExtractor` ¶

Bases: DataExtractor

Advanced email extractor with support for obfuscated and Cloudflare-protected emails.

Source code in extract_emails/data_extractors/advanced_email_extractor.py

class AdvancedEmailExtractor(DataExtractor):
    """Advanced email extractor with support for obfuscated and Cloudflare-protected emails."""

    def __init__(self):
        # Regex for standard emails; obfuscated ones are normalized by
        # preprocess() before this pattern is applied.
        self.email_pattern = re.compile(
            r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", re.IGNORECASE
        )

    @property
    def name(self) -> str:
        """Name of the data extractor.

        Returns:
            "email"
        """
        return "email"

    def preprocess(self, text: str) -> str:
        """Normalize common obfuscations to standard email format.

        Replaces common email obfuscation patterns like "[at]", "(at)", " at " with "@"
        and "[dot]", "(dot)", " dot " with ".". Whitespace surrounding a bracketed
        or parenthesized token is removed as well, so "john [at] example [dot] com"
        becomes "john@example.com" rather than "john @ example . com".

        Args:
            text: Text content that may contain obfuscated email addresses.

        Returns:
            Text with obfuscations normalized to standard email format.
        """
        replacements = [
            (r"\s*\[\s*at\s*\]\s*", "@"),
            (r"\s*\(\s*at\s*\)\s*", "@"),
            (r"\s+at\s+", "@"),
            (r"\s*\[\s*dot\s*\]\s*", "."),
            (r"\s*\(\s*dot\s*\)\s*", "."),
            (r"\s+dot\s+", "."),
        ]
        # Applied one after another, in this order.
        for pattern, repl in replacements:
            text = re.sub(pattern, repl, text, flags=re.IGNORECASE)
        return text

    def cf_decode_email(self, encoded: str) -> str:
        """Decode Cloudflare-protected email from data-cfemail attribute.

        The first two hex digits are a one-byte key; every following pair of
        hex digits is XOR-ed with that key to recover one character.

        Args:
            encoded: Hex-encoded email string from data-cfemail attribute.

        Returns:
            Decoded email address.

        Raises:
            ValueError: If ``encoded`` is empty or contains non-hex characters.
        """
        key = int(encoded[:2], 16)
        return "".join(
            chr(int(encoded[i : i + 2], 16) ^ key) for i in range(2, len(encoded), 2)
        )

    def is_junk(self, email: str) -> bool:
        """Filter likely junk/system emails.

        Checks if an email address appears to be a system or junk email based on
        various heuristics like length, hex patterns, and known junk domains.
        A string that is not of the form ``local@domain`` is reported as junk
        instead of raising.

        Args:
            email: Email address to check.

        Returns:
            True if the email appears to be junk, False otherwise.
        """
        local, separator, domain = email.partition("@")
        if not separator or not local or not domain:
            # Not an address at all (e.g. a badly decoded Cloudflare payload).
            return True
        if len(local) > 25:
            return True
        # Case-insensitive: callers lower-case addresses only after this check.
        if re.fullmatch(r"[0-9a-f]{8,}", local, flags=re.IGNORECASE):
            return True
        # NOTE(review): "mailer-daemon" is matched as a domain suffix here;
        # confirm this is intended, since it usually appears as a local part.
        junk_domains = ("sentry.wixpress.com", "no-reply.github.com", "mailer-daemon")
        return domain.lower().endswith(junk_domains)

    def get_data(self, page_source: str) -> set[str]:
        """Extract emails from a webpage with support for obfuscated and protected emails.

        Extracts email addresses using multiple strategies:
        1. Regex matching on preprocessed text (handles common obfuscations)
        2. Decoding Cloudflare-protected emails from data-cfemail attributes

        Blank input and undecodable ``data-cfemail`` values are skipped rather
        than aborting the whole extraction.

        Args:
            page_source: HTML webpage content.

        Returns:
            Set of extracted email addresses (the result of ``email_filter``).
        """
        emails = set()

        # 1. Extract emails from normal text (with preprocessing)
        cleaned_text = self.preprocess(page_source)
        for match in self.email_pattern.findall(cleaned_text):
            if not self.is_junk(match):
                emails.add(match.lower())

        # 2. Extract Cloudflare-obfuscated emails
        # Skipped for blank input: `html.fromstring` looks like lxml.html
        # (confirm against the file's imports), which rejects an empty document.
        if page_source.strip():
            doc = html.fromstring(page_source)
            for elem in doc.cssselect("[data-cfemail]"):
                encoded = elem.get("data-cfemail")
                if not encoded:
                    continue
                try:
                    decoded = self.cf_decode_email(encoded)
                except ValueError:
                    # Attribute value is not valid hex; ignore this element.
                    continue
                if not self.is_junk(decoded):
                    emails.add(decoded.lower())

        return email_filter(emails)

`name` `property` ¶

Name of the data extractor.

Returns:

Type	Description
`str`	"email"

`cf_decode_email(encoded)` ¶

Decode Cloudflare-protected email from data-cfemail attribute.

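Decodes email addresses obfuscated by Cloudflare's email protection feature: the first two hex digits are a one-byte key, and each following pair of hex digits is XOR-ed with that key to recover one character.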

Parameters:

Name	Type	Description	Default
`encoded`	`str`	Hex-encoded email string from data-cfemail attribute.	required

Returns:

Type	Description
`str`	Decoded email address.

Source code in extract_emails/data_extractors/advanced_email_extractor.py

def cf_decode_email(self, encoded: str) -> str:
    """Decode Cloudflare-protected email from data-cfemail attribute.

    The first two hex digits are a one-byte key; every following pair of hex
    digits is XOR-ed with that key to recover one character.

    Args:
        encoded: Hex-encoded email string from data-cfemail attribute.

    Returns:
        Decoded email address.
    """
    xor_key = int(encoded[:2], 16)
    decoded_chars = []
    # Walk the payload two hex digits (one byte) at a time, after the key.
    for offset in range(2, len(encoded), 2):
        byte_value = int(encoded[offset : offset + 2], 16)
        decoded_chars.append(chr(byte_value ^ xor_key))
    return "".join(decoded_chars)

`get_data(page_source)` ¶

Extract emails from a webpage with support for obfuscated and protected emails.

Extracts email addresses using two strategies: (1) regex matching on preprocessed text, which handles common obfuscations, and (2) decoding Cloudflare-protected emails from data-cfemail attributes.

Parameters:

Name	Type	Description	Default
`page_source`	`str`	HTML webpage content.	required

Returns:

Type	Description
`set[str]`	Set of extracted email addresses.

Source code in extract_emails/data_extractors/advanced_email_extractor.py

def get_data(self, page_source: str) -> set[str]:
    """Extract emails from a webpage with support for obfuscated and protected emails.

    Extracts email addresses using multiple strategies:
    1. Regex matching on preprocessed text (handles common obfuscations)
    2. Decoding Cloudflare-protected emails from data-cfemail attributes

    Blank input and undecodable ``data-cfemail`` values are skipped rather
    than aborting the whole extraction.

    Args:
        page_source: HTML webpage content.

    Returns:
        Set of extracted email addresses (the result of ``email_filter``).
    """
    emails = set()

    # 1. Extract emails from normal text (with preprocessing)
    cleaned_text = self.preprocess(page_source)
    for match in self.email_pattern.findall(cleaned_text):
        if not self.is_junk(match):
            emails.add(match.lower())

    # 2. Extract Cloudflare-obfuscated emails
    # Skipped for blank input: `html.fromstring` looks like lxml.html
    # (confirm against the file's imports), which rejects an empty document.
    if page_source.strip():
        doc = html.fromstring(page_source)
        for elem in doc.cssselect("[data-cfemail]"):
            encoded = elem.get("data-cfemail")
            if not encoded:
                continue
            try:
                decoded = self.cf_decode_email(encoded)
            except ValueError:
                # Attribute value is not valid hex; ignore this element.
                continue
            # is_junk() splits on "@", so a payload without one must be
            # rejected before it gets there.
            if "@" not in decoded or self.is_junk(decoded):
                continue
            emails.add(decoded.lower())

    return email_filter(emails)

`is_junk(email)` ¶

Filter likely junk/system emails.

Checks if an email address appears to be a system or junk email based on various heuristics like length, hex patterns, and known junk domains.

Parameters:

Name	Type	Description	Default
`email`	`str`	Email address to check.	required

Returns:

Type	Description
`bool`	True if the email appears to be junk, False otherwise.

Source code in extract_emails/data_extractors/advanced_email_extractor.py

def is_junk(self, email: str) -> bool:
    """Filter likely junk/system emails.

    Checks if an email address appears to be a system or junk email based on
    various heuristics like length, hex patterns, and known junk domains.
    A string that is not of the form ``local@domain`` is reported as junk
    instead of raising.

    Args:
        email: Email address to check.

    Returns:
        True if the email appears to be junk, False otherwise.
    """
    local, separator, domain = email.partition("@")
    if not separator or not local or not domain:
        # Not an address at all (e.g. a badly decoded payload).
        return True
    if len(local) > 25:
        return True
    # Case-insensitive so upper-case hex local parts are caught as well.
    if re.fullmatch(r"[0-9a-f]{8,}", local, flags=re.IGNORECASE):
        return True
    # NOTE(review): "mailer-daemon" is matched as a domain suffix here;
    # confirm this is intended, since it usually appears as a local part.
    junk_domains = ("sentry.wixpress.com", "no-reply.github.com", "mailer-daemon")
    return domain.lower().endswith(junk_domains)

`preprocess(text)` ¶

Normalize common obfuscations to standard email format.

Replaces common email obfuscation patterns like "[at]", "(at)", " at " with "@" and "[dot]", "(dot)", " dot " with ".".

Parameters:

Name	Type	Description	Default
`text`	`str`	Text content that may contain obfuscated email addresses.	required

Returns:

Type	Description
`str`	Text with obfuscations normalized to standard email format.

Source code in extract_emails/data_extractors/advanced_email_extractor.py

def preprocess(self, text: str) -> str:
    """Normalize common obfuscations to standard email format.

    Replaces common email obfuscation patterns like "[at]", "(at)", " at " with "@"
    and "[dot]", "(dot)", " dot " with ".". Whitespace surrounding a bracketed
    or parenthesized token is removed as well, so "john [at] example [dot] com"
    becomes "john@example.com" rather than "john @ example . com".

    Args:
        text: Text content that may contain obfuscated email addresses.

    Returns:
        Text with obfuscations normalized to standard email format.
    """
    replacements = [
        (r"\s*\[\s*at\s*\]\s*", "@"),
        (r"\s*\(\s*at\s*\)\s*", "@"),
        (r"\s+at\s+", "@"),
        (r"\s*\[\s*dot\s*\]\s*", "."),
        (r"\s*\(\s*dot\s*\)\s*", "."),
        (r"\s+dot\s+", "."),
    ]
    # Applied one after another, in this order.
    for pattern, repl in replacements:
        text = re.sub(pattern, repl, text, flags=re.IGNORECASE)
    return text

`LinkedinExtractor` ¶

Bases: DataExtractor

Source code in extract_emails/data_extractors/linkedin_extractor.py

class LinkedinExtractor(DataExtractor):
    """Extractor that collects links to Linkedin profiles from page content."""

    def __init__(self):
        # General URL pattern; get_data() keeps only the Linkedin profile links.
        self.regexp = re.compile(
            r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
        )

    @property
    def name(self) -> str:
        """Name of the data extractor: "linkedin"."""
        return "linkedin"

    def get_data(self, page_source: str) -> set[str]:
        """Extract links to Linkedin profiles.

        Args:
            page_source: webpage content

        Returns:
            Set of urls, e.g. {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'}
        """
        profile_marker = "linkedin.com/in/"
        # The pattern has several capturing groups, so each match is a tuple;
        # its first item is the complete URL.
        return {
            groups[0]
            for groups in self.regexp.findall(page_source)
            if profile_marker in groups[0]
        }

`get_data(page_source)` ¶

Extract links to Linkedin profiles

Parameters:

Name	Type	Description	Default
`page_source`	`str`	webpage content	required

Returns:

Type	Description
`set[str]`	Set of urls, e.g. {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'}

Source code in extract_emails/data_extractors/linkedin_extractor.py

def get_data(self, page_source: str) -> set[str]:
    """Extract links to Linkedin profiles.

    Args:
        page_source: webpage content

    Returns:
        Set of urls, e.g. {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'}
    """
    profile_marker = "linkedin.com/in/"
    # Each match is indexed with [0] to obtain the URL text.
    return {
        groups[0]
        for groups in self.regexp.findall(page_source)
        if profile_marker in groups[0]
    }

Data Extractors¶

DataExtractor ¶

name abstractmethod property ¶

get_data(page_source) abstractmethod ¶

EmailExtractor ¶

get_data(page_source) ¶

AdvancedEmailExtractor ¶

name property ¶

cf_decode_email(encoded) ¶

get_data(page_source) ¶

is_junk(email) ¶

preprocess(text) ¶

LinkedinExtractor ¶

get_data(page_source) ¶

`DataExtractor` ¶

`name` `abstractmethod` `property` ¶

`get_data(page_source)` `abstractmethod` ¶

`EmailExtractor` ¶

`get_data(page_source)` ¶

`AdvancedEmailExtractor` ¶

`name` `property` ¶

`cf_decode_email(encoded)` ¶

`get_data(page_source)` ¶

`is_junk(email)` ¶

`preprocess(text)` ¶

`LinkedinExtractor` ¶

`get_data(page_source)` ¶