Skip to content

Data Extractors

DataExtractor

Bases: ABC

Base class for all data extractors

Source code in extract_emails/data_extractors/data_extractor.py
 4
 5
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
class DataExtractor(ABC):
    """Abstract interface implemented by every data extractor.

    A concrete subclass supplies a ``name`` that identifies the kind of data
    it finds and a ``get_data`` method that pulls that data out of a string.
    """

    @property
    @abstractmethod
    def name(self) -> str:
        """Short identifier of the extractor, e.g. ``email`` or ``linkedin``."""

    @abstractmethod
    def get_data(self, page_source: str) -> set[str]:
        """Pull the relevant items out of a piece of text.

        Args:
            page_source: Content of a webpage.

        Returns:
            The extracted items as a set, e.g.
            ``{'email@email.com', 'email2@email.com'}``.
        """

name abstractmethod property

Name of the data extractor, e.g. email, linkedin

get_data(page_source) abstractmethod

Extract needed data from a string

Parameters:

Name Type Description Default
page_source str

webpage content

required

Returns:

Type Description
set[str]

Set of data, e.g. {'email@email.com', 'email2@email.com'}

Source code in extract_emails/data_extractors/data_extractor.py
12
13
14
15
16
17
18
19
20
21
@abstractmethod
def get_data(self, page_source: str) -> set[str]:
    """Pull the relevant items out of a piece of text.

    Args:
        page_source: Content of a webpage.

    Returns:
        The extracted items as a set, e.g.
        ``{'email@email.com', 'email2@email.com'}``.
    """

EmailExtractor

Bases: DataExtractor

Source code in extract_emails/data_extractors/email_extractor.py
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
class EmailExtractor(DataExtractor):
    """Extract email addresses from text with a single regular expression."""

    def __init__(self):
        # The character classes in this pattern only list lowercase letters,
        # so it has to be compiled case-insensitively. Without the flag an
        # address such as "John.Doe@example.com" is cut down to
        # "oe@example.com", and "info@Example.com" is not found at all.
        self.regexp = re.compile(
            r"""(?:[a-z0-9!#$%&'*+/=?^_`{|}~-]+(?:\.[a-z0-9!#$%&'*+/=?^_`{|}~-]+)*|"(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21\x23-\x5b\x5d-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])*")@(?:(?:[a-z0-9](?:[a-z0-9-]*[a-z0-9])?\.)+[a-z0-9](?:[a-z0-9-]*[a-z0-9])?|\[(?:(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\.){3}(?:25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?|[a-z0-9-]*[a-z0-9]:(?:[\x01-\x08\x0b\x0c\x0e-\x1f\x21-\x5a\x53-\x7f]|\\[\x01-\x09\x0b\x0c\x0e-\x7f])+)\])""",
            re.IGNORECASE,
        )

    @property
    def name(self) -> str:
        """Name of the data extractor, always ``"email"``."""
        return "email"

    def get_data(self, page_source: str) -> set[str]:
        """Extract emails from a string

        Args:
            page_source: webpage content

        Returns:
            Set of emails, e.g. {'email@email.com', 'email2@email.com'}
        """
        # The pattern has no capturing groups, so findall() already returns a
        # fresh list of whole-match strings; no extra copy is needed.
        return email_filter(self.regexp.findall(page_source))

get_data(page_source)

Extract emails from a string

Parameters:

Name Type Description Default
page_source str

webpage content

required

Returns:

Type Description
set[str]

Set of emails, e.g. {'email@email.com', 'email2@email.com'}

Source code in extract_emails/data_extractors/email_extractor.py
17
18
19
20
21
22
23
24
25
26
27
def get_data(self, page_source: str) -> set[str]:
    """Find every email address in the given text.

    Args:
        page_source: Content of a webpage.

    Returns:
        The result of passing all regex matches through ``email_filter``,
        e.g. {'email@email.com', 'email2@email.com'}.
    """
    # findall() hands back a new list of matches on every call.
    matches = self.regexp.findall(page_source)
    return email_filter(matches)

AdvancedEmailExtractor

Bases: DataExtractor

Advanced email extractor with support for obfuscated and Cloudflare-protected emails.

Source code in extract_emails/data_extractors/advanced_email_extractor.py
  9
 10
 11
 12
 13
 14
 15
 16
 17
 18
 19
 20
 21
 22
 23
 24
 25
 26
 27
 28
 29
 30
 31
 32
 33
 34
 35
 36
 37
 38
 39
 40
 41
 42
 43
 44
 45
 46
 47
 48
 49
 50
 51
 52
 53
 54
 55
 56
 57
 58
 59
 60
 61
 62
 63
 64
 65
 66
 67
 68
 69
 70
 71
 72
 73
 74
 75
 76
 77
 78
 79
 80
 81
 82
 83
 84
 85
 86
 87
 88
 89
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
class AdvancedEmailExtractor(DataExtractor):
    """Advanced email extractor with support for obfuscated and Cloudflare-protected emails."""

    def __init__(self):
        # Matches plain addresses; obfuscated ones are normalised by
        # ``preprocess`` before this pattern is applied.
        self.email_pattern = re.compile(
            r"[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}", re.IGNORECASE
        )

    @property
    def name(self) -> str:
        """Name of the data extractor.

        Returns:
            "email"
        """
        return "email"

    def preprocess(self, text: str) -> str:
        """Normalize common obfuscations to standard email format.

        Replaces "[at]", "(at)" and " at " with "@", and "[dot]", "(dot)" and
        " dot " with ".". Matching is case-insensitive. Whitespace around the
        bracketed forms is removed together with the token, so
        "john [at] example [dot] com" becomes "john@example.com" rather than
        "john @ example . com", which the email pattern could not match.

        Args:
            text: Text content that may contain obfuscated email addresses.

        Returns:
            Text with obfuscations normalized to standard email format.
        """
        replacements = [
            (r"\s*\[\s*at\s*\]\s*", "@"),
            (r"\s*\(\s*at\s*\)\s*", "@"),
            (r"\s+at\s+", "@"),
            (r"\s*\[\s*dot\s*\]\s*", "."),
            (r"\s*\(\s*dot\s*\)\s*", "."),
            (r"\s+dot\s+", "."),
        ]
        for pattern, repl in replacements:
            text = re.sub(pattern, repl, text, flags=re.IGNORECASE)
        return text

    def cf_decode_email(self, encoded: str) -> str:
        """Decode Cloudflare-protected email from data-cfemail attribute.

        The first two hex digits are the XOR key; every following pair of hex
        digits is one character XOR-ed with that key.

        Args:
            encoded: Hex-encoded email string from data-cfemail attribute.

        Returns:
            Decoded email address.

        Raises:
            ValueError: If ``encoded`` is empty or contains non-hex characters.
        """
        key = int(encoded[:2], 16)
        return "".join(
            chr(int(encoded[i : i + 2], 16) ^ key) for i in range(2, len(encoded), 2)
        )

    def is_junk(self, email: str) -> bool:
        """Filter likely junk/system emails.

        An address is treated as junk when it has no "@", when its local part
        is longer than 25 characters, consists only of 8 or more hex digits,
        or is "mailer-daemon", or when its domain ends with a known junk
        domain. All comparisons are case-insensitive.

        Args:
            email: Email address to check.

        Returns:
            True if the email appears to be junk, False otherwise.
        """
        local, separator, domain = email.partition("@")
        if not separator:
            # Not an address at all (e.g. a badly decoded data-cfemail value).
            return True
        local = local.lower()
        if len(local) > 25:
            return True
        if re.fullmatch(r"[0-9a-f]{8,}", local):
            return True
        # "mailer-daemon" is a mailbox name, so it is checked against the
        # local part; the suffix entry below is kept for compatibility.
        if local == "mailer-daemon":
            return True
        junk_domains = ("sentry.wixpress.com", "no-reply.github.com", "mailer-daemon")
        return domain.lower().endswith(junk_domains)

    def get_data(self, page_source: str) -> set[str]:
        """Extract emails from a webpage with support for obfuscated and protected emails.

        Extracts email addresses using multiple strategies:
        1. Regex matching on preprocessed text (handles common obfuscations)
        2. Decoding Cloudflare-protected emails from data-cfemail attributes

        Junk addresses (see ``is_junk``) are dropped, the rest are lower-cased
        and passed through ``email_filter``.

        Args:
            page_source: HTML webpage content.

        Returns:
            Set of extracted email addresses.
        """
        emails: set[str] = set()

        # 1. Extract emails from normal text (with preprocessing)
        cleaned_text = self.preprocess(page_source)
        for match in self.email_pattern.findall(cleaned_text):
            if not self.is_junk(match):
                emails.add(match.lower())

        # 2. Extract Cloudflare-obfuscated emails.
        # Skipped for blank input: html.fromstring (assuming `html` is
        # lxml.html, as the cssselect call suggests) raises on an empty document.
        if page_source.strip():
            doc = html.fromstring(page_source)
            for elem in doc.cssselect("[data-cfemail]"):
                encoded = elem.get("data-cfemail")
                if not encoded:
                    continue
                try:
                    decoded = self.cf_decode_email(encoded)
                except ValueError:
                    # Malformed attribute value; one bad element must not
                    # abort extraction for the whole page.
                    continue
                if not self.is_junk(decoded):
                    emails.add(decoded.lower())

        return email_filter(emails)

name property

Name of the data extractor.

Returns:

Type Description
str

"email"

cf_decode_email(encoded)

Decode Cloudflare-protected email from data-cfemail attribute.

Decodes email addresses that are obfuscated by Cloudflare's email protection feature using XOR encryption.

Parameters:

Name Type Description Default
encoded str

Hex-encoded email string from data-cfemail attribute.

required

Returns:

Type Description
str

Decoded email address.

Source code in extract_emails/data_extractors/advanced_email_extractor.py
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
def cf_decode_email(self, encoded: str) -> str:
    """Turn a Cloudflare ``data-cfemail`` value back into plain text.

    The first two hex digits hold the XOR key; each following pair of hex
    digits is one character XOR-ed with that key.

    Args:
        encoded: Hex string taken from a data-cfemail attribute.

    Returns:
        The decoded text.
    """
    xor_key = int(encoded[:2], 16)
    characters = []
    # Walk the payload two hex digits at a time, starting after the key.
    for offset in range(2, len(encoded), 2):
        byte_value = int(encoded[offset : offset + 2], 16)
        characters.append(chr(byte_value ^ xor_key))
    return "".join(characters)

get_data(page_source)

Extract emails from a webpage with support for obfuscated and protected emails.

Extracts email addresses using two strategies: (1) regex matching on preprocessed text, which handles common obfuscations, and (2) decoding Cloudflare-protected emails from data-cfemail attributes.

Parameters:

Name Type Description Default
page_source str

HTML webpage content.

required

Returns:

Type Description
set[str]

Set of extracted email addresses.

Source code in extract_emails/data_extractors/advanced_email_extractor.py
 90
 91
 92
 93
 94
 95
 96
 97
 98
 99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
def get_data(self, page_source: str) -> set[str]:
    """Extract emails from a webpage with support for obfuscated and protected emails.

    Extracts email addresses using multiple strategies:
    1. Regex matching on preprocessed text (handles common obfuscations)
    2. Decoding Cloudflare-protected emails from data-cfemail attributes

    Junk addresses (see ``is_junk``) are dropped, the rest are lower-cased
    and passed through ``email_filter``. Blank input and malformed
    data-cfemail values are skipped instead of raising.

    Args:
        page_source: HTML webpage content.

    Returns:
        Set of extracted email addresses.
    """
    emails: set[str] = set()

    # 1. Extract emails from normal text (with preprocessing)
    cleaned_text = self.preprocess(page_source)
    for match in self.email_pattern.findall(cleaned_text):
        if not self.is_junk(match):
            emails.add(match.lower())

    # 2. Extract Cloudflare-obfuscated emails.
    # Skipped for blank input: html.fromstring (assuming `html` is lxml.html,
    # as the cssselect call suggests) raises on an empty document.
    if page_source.strip():
        doc = html.fromstring(page_source)
        for elem in doc.cssselect("[data-cfemail]"):
            encoded = elem.get("data-cfemail")
            if not encoded:
                continue
            try:
                decoded = self.cf_decode_email(encoded)
            except ValueError:
                # Malformed attribute value; one bad element must not abort
                # extraction for the whole page.
                continue
            # A decoded value without "@" is not an address; checking here
            # keeps is_junk from being asked to split it.
            if "@" not in decoded or self.is_junk(decoded):
                continue
            emails.add(decoded.lower())

    return email_filter(emails)

is_junk(email)

Filter likely junk/system emails.

Checks if an email address appears to be a system or junk email based on various heuristics like length, hex patterns, and known junk domains.

Parameters:

Name Type Description Default
email str

Email address to check.

required

Returns:

Type Description
bool

True if the email appears to be junk, False otherwise.

Source code in extract_emails/data_extractors/advanced_email_extractor.py
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
def is_junk(self, email: str) -> bool:
    """Filter likely junk/system emails.

    Checks if an email address appears to be a system or junk email based on
    various heuristics like length, hex patterns, and known junk domains.

    Args:
        email: Email address to check.

    Returns:
        True if the email appears to be junk, False otherwise.
    """
    local, domain = email.split("@", 1)
    if len(local) > 25:
        return True
    if re.fullmatch(r"[0-9a-f]{8,}", local):
        return True
    junk_domains = ("sentry.wixpress.com", "no-reply.github.com", "mailer-daemon")
    if domain.lower().endswith(junk_domains):
        return True
    return False

preprocess(text)

Normalize common obfuscations to standard email format.

Replaces common email obfuscation patterns like "[at]", "(at)", " at " with "@" and "[dot]", "(dot)", " dot " with ".".

Parameters:

Name Type Description Default
text str

Text content that may contain obfuscated email addresses.

required

Returns:

Type Description
str

Text with obfuscations normalized to standard email format.

Source code in extract_emails/data_extractors/advanced_email_extractor.py
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
def preprocess(self, text: str) -> str:
    """Normalize common obfuscations to standard email format.

    Replaces "[at]", "(at)" and " at " with "@", and "[dot]", "(dot)" and
    " dot " with ".". Matching is case-insensitive. Whitespace around the
    bracketed forms is removed together with the token, so
    "john [at] example [dot] com" becomes "john@example.com" rather than
    "john @ example . com", which an email pattern could not match.

    Args:
        text: Text content that may contain obfuscated email addresses.

    Returns:
        Text with obfuscations normalized to standard email format.
    """
    replacements = [
        (r"\s*\[\s*at\s*\]\s*", "@"),
        (r"\s*\(\s*at\s*\)\s*", "@"),
        (r"\s+at\s+", "@"),
        (r"\s*\[\s*dot\s*\]\s*", "."),
        (r"\s*\(\s*dot\s*\)\s*", "."),
        (r"\s+dot\s+", "."),
    ]
    for pattern, repl in replacements:
        text = re.sub(pattern, repl, text, flags=re.IGNORECASE)
    return text

LinkedinExtractor

Bases: DataExtractor

Source code in extract_emails/data_extractors/linkedin_extractor.py
 6
 7
 8
 9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
class LinkedinExtractor(DataExtractor):
    """Find links to LinkedIn profiles in text."""

    def __init__(self):
        # General-purpose URL pattern; group 1 is the whole URL.
        self.regexp = re.compile(
            r"(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'\".,<>?«»“”‘’]))"
        )

    @property
    def name(self) -> str:
        """Name of the data extractor, always ``"linkedin"``."""
        return "linkedin"

    def get_data(self, page_source: str) -> set[str]:
        """Collect every URL in the text that points at a LinkedIn profile.

        Args:
            page_source: Content of a webpage.

        Returns:
            Set of URLs containing ``linkedin.com/in/``, e.g.
            {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'}
        """
        profile_marker = "linkedin.com/in/"
        # findall() yields one tuple of groups per URL; the first group is
        # the complete URL.
        return {
            groups[0]
            for groups in self.regexp.findall(page_source)
            if profile_marker in groups[0]
        }

get_data(page_source)

Extract links to Linkedin profiles

Parameters:

Name Type Description Default
page_source str

webpage content

required

Returns:

Type Description
set[str]

Set of urls, e.g. {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'}

Source code in extract_emails/data_extractors/linkedin_extractor.py
16
17
18
19
20
21
22
23
24
25
26
27
28
def get_data(self, page_source: str) -> set[str]:
    """Collect every URL in the text that points at a LinkedIn profile.

    Args:
        page_source: Content of a webpage.

    Returns:
        Set of URLs containing ``linkedin.com/in/``, e.g.
        {'https://www.linkedin.com/in/venjamin-brant-73381ujy3u'}
    """
    profile_marker = "linkedin.com/in/"
    # findall() yields one tuple of groups per URL; the first group is the
    # complete URL.
    return {
        groups[0]
        for groups in self.regexp.findall(page_source)
        if profile_marker in groups[0]
    }