Skip to content

Factories

BaseFactory

Base class for all factories

data_extractors: List[Type[extract_emails.data_extractors.data_extractor.DataExtractor]] property readonly

Initialize data extractors

Initialize link filter

__init__(self, *, website_url, browser, depth=None, max_links_from_page=None) special

Parameters:

Name Type Description Default
website_url str

website for scan, e.g. https://example.com

required
browser PageSourceGetter

browser to get page source by URL

required
depth Optional[int]

scan's depth, default 10

None
max_links_from_page Optional[int]

how many links a script shall get from each page, default None (all)

None
Source code in extract_emails/factories/base_factory.py
def __init__(
    self,
    *,
    website_url: str,
    browser: PageSourceGetter,
    depth: Optional[int] = None,
    max_links_from_page: Optional[int] = None,
):
    """Store the scan configuration on the factory instance.

    Every argument is keyword-only and is saved unchanged on a private
    attribute; no validation or defaulting is performed here.

    Args:
        website_url: website for scan, e.g. https://example.com
        browser: browser to get page source by URL
        depth: scan's depth; stored as given (``None`` when omitted).
            NOTE(review): the docs mention a default of 10 -- presumably
            applied by whatever consumes this value; confirm.
        max_links_from_page: how many links a script shall get from each
            page; ``None`` (the default) means all
    """
    # Target site and the page-source getter used to fetch it.
    self._website_url, self._browser = website_url, browser
    # Optional crawl limits, kept exactly as passed.
    self._depth, self._max_links_from_page = depth, max_links_from_page

DefaultFilterAndEmailFactory

Will initialize DefaultLinkFilter and EmailExtractor

Parameters:

Name Type Description Default
website_url str

website for scan, e.g. https://example.com

required
browser PageSourceGetter

browser to get page source by URL

required
depth Optional[int]

scan's depth (None falls back to the default of 10). Defaults to None

None
max_links_from_page Optional[int]

how many links a script shall get from each page. Defaults to None

None

Examples:

>>> from extract_emails import DefaultFilterAndEmailFactory as Factory
>>> from extract_emails import DefaultWorker
>>> from extract_emails.browsers.requests_browser import RequestsBrowser as Browser
>>>
>>> browser = Browser()
>>> url = 'https://en.wikipedia.org/'
>>> factory = Factory(website_url=url, browser=browser)
>>> worker = DefaultWorker(factory)
>>> data = worker.get_data()
>>> data
    [
        PageData(
            website='https://en.wikipedia.org/',
            page_url='https://en.wikipedia.org/Email_address',
            data={'email': ['"John.Doe."@example.com', 'x@example.com']}
        ),
        PageData(
            website='https://en.wikipedia.org/',
            page_url='https://en.wikipedia.org/Email_address2',
            data={'email': ['"John.Doe2."@example.com', 'x2@example.com']}
        ),
    ]

data_extractors: List[extract_emails.data_extractors.email_extractor.EmailExtractor] property readonly

Initialize EmailExtractor

Initialize DefaultLinkFilter

DefaultFilterAndLinkedinFactory

Will initialize DefaultLinkFilter and LinkedinExtractor

Parameters:

Name Type Description Default
website_url str

website for scan, e.g. https://example.com

required
browser PageSourceGetter

browser to get page source by URL

required
depth Optional[int]

scan's depth (None falls back to the default of 10). Defaults to None

None
max_links_from_page Optional[int]

how many links a script shall get from each page. Defaults to None

None

Examples:

>>> from extract_emails import DefaultFilterAndLinkedinFactory as Factory
>>> from extract_emails import DefaultWorker
>>> from extract_emails.browsers.requests_browser import RequestsBrowser as Browser
>>>
>>> browser = Browser()
>>> url = 'https://en.wikipedia.org/'
>>> factory = Factory(website_url=url, browser=browser)
>>> worker = DefaultWorker(factory)
>>> data = worker.get_data()
>>> data
    [
        PageData(
            website='https://en.wikipedia.org/',
            page_url='https://en.wikipedia.org/Email_address',
            data={'linkedin': ['linkedin profile url 1', 'linkedin profile url 2']}
        ),
        PageData(
            website='https://en.wikipedia.org/',
            page_url='https://en.wikipedia.org/Email_address2',
            data={'linkedin': ['linkedin profile url 3', 'linkedin profile url 4']}
        ),
    ]

data_extractors: List[extract_emails.data_extractors.linkedin_extractor.LinkedinExtractor] property readonly

Initialize LinkedinExtractor

Initialize DefaultLinkFilter

DefaultFilterAndEmailAndLinkedinFactory

Will initialize DefaultLinkFilter and EmailExtractor and LinkedinExtractor

Parameters:

Name Type Description Default
website_url str

website for scan, e.g. https://example.com

required
browser PageSourceGetter

browser to get page source by URL

required
depth Optional[int]

scan's depth (None falls back to the default of 10). Defaults to None

None
max_links_from_page Optional[int]

how many links a script shall get from each page. Defaults to None

None

Examples:

>>> from extract_emails import DefaultFilterAndEmailAndLinkedinFactory as Factory
>>> from extract_emails import DefaultWorker
>>> from extract_emails.browsers.requests_browser import RequestsBrowser as Browser
>>>
>>> browser = Browser()
>>> url = 'https://en.wikipedia.org/'
>>> factory = Factory(website_url=url, browser=browser)
>>> worker = DefaultWorker(factory)
>>> data = worker.get_data()
>>> data
    [
        PageData(
            website='https://en.wikipedia.org/',
            page_url='https://en.wikipedia.org/Email_address',
            data={
            'email': ['"John.Doe."@example.com', 'x@example.com'],
            'linkedin': ['linkedin_url1', 'linkedin_url2'],
            }
        ),
        PageData(
            website='https://en.wikipedia.org/',
            page_url='https://en.wikipedia.org/Email_address2',
            data={
            'email': ['"John.Doe2."@example.com', 'x2@example.com'],
            'linkedin': ['linkedin_url3', 'linkedin_url4'],
            }
        ),
    ]

data_extractors: List[Union[extract_emails.data_extractors.email_extractor.EmailExtractor, extract_emails.data_extractors.linkedin_extractor.LinkedinExtractor]] property readonly

Initialize EmailExtractor and LinkedinExtractor

Initialize DefaultLinkFilter

ContactFilterAndEmailFactory

Will initialize ContactInfoLinkFilter and EmailExtractor

Parameters:

Name Type Description Default
website_url str

website for scan, e.g. https://example.com

required
browser PageSourceGetter

browser to get page source by URL

required
depth Optional[int]

scan's depth (None falls back to the default of 10). Defaults to None

None
max_links_from_page Optional[int]

how many links a script shall get from each page. Defaults to None

None

Examples:

>>> from extract_emails import ContactFilterAndEmailFactory as Factory
>>> from extract_emails import DefaultWorker
>>> from extract_emails.browsers.requests_browser import RequestsBrowser as Browser
>>>
>>> browser = Browser()
>>> url = 'https://en.wikipedia.org/'
>>> factory = Factory(website_url=url, browser=browser)
>>> worker = DefaultWorker(factory)
>>> data = worker.get_data()
>>> data
    [
        PageData(
            website='https://en.wikipedia.org/',
            page_url='https://en.wikipedia.org/Email_address',
            data={'email': ['"John.Doe."@example.com', 'x@example.com']}
        ),
        PageData(
            website='https://en.wikipedia.org/',
            page_url='https://en.wikipedia.org/Email_address2',
            data={'email': ['"John.Doe2."@example.com', 'x2@example.com']}
        ),
    ]

data_extractors: List[extract_emails.data_extractors.email_extractor.EmailExtractor] property readonly

Initialize EmailExtractor

Initialize ContactInfoLinkFilter

ContactFilterAndLinkedinFactory

Will initialize ContactInfoLinkFilter and LinkedinExtractor

Parameters:

Name Type Description Default
website_url str

website for scan, e.g. https://example.com

required
browser PageSourceGetter

browser to get page source by URL

required
depth Optional[int]

scan's depth (None falls back to the default of 10). Defaults to None

None
max_links_from_page Optional[int]

how many links a script shall get from each page. Defaults to None

None

Examples:

>>> from extract_emails import ContactFilterAndLinkedinFactory as Factory
>>> from extract_emails import DefaultWorker
>>> from extract_emails.browsers.requests_browser import RequestsBrowser as Browser
>>>
>>> browser = Browser()
>>> url = 'https://en.wikipedia.org/'
>>> factory = Factory(website_url=url, browser=browser)
>>> worker = DefaultWorker(factory)
>>> data = worker.get_data()
>>> data
    [
        PageData(
            website='https://en.wikipedia.org/',
            page_url='https://en.wikipedia.org/Email_address',
            data={'linkedin': ['linkedin profile url 1', 'linkedin profile url 2']}
        ),
        PageData(
            website='https://en.wikipedia.org/',
            page_url='https://en.wikipedia.org/Email_address2',
            data={'linkedin': ['linkedin profile url 3', 'linkedin profile url 4']}
        ),
    ]

data_extractors: List[extract_emails.data_extractors.linkedin_extractor.LinkedinExtractor] property readonly

Initialize LinkedinExtractor

Initialize ContactInfoLinkFilter

ContactFilterAndEmailAndLinkedinFactory

Will initialize ContactInfoLinkFilter and EmailExtractor and LinkedinExtractor

Parameters:

Name Type Description Default
website_url str

website for scan, e.g. https://example.com

required
browser PageSourceGetter

browser to get page source by URL

required
depth Optional[int]

scan's depth (None falls back to the default of 10). Defaults to None

None
max_links_from_page Optional[int]

how many links a script shall get from each page. Defaults to None

None

Examples:

>>> from extract_emails import ContactFilterAndEmailAndLinkedinFactory as Factory
>>> from extract_emails import DefaultWorker
>>> from extract_emails.browsers.requests_browser import RequestsBrowser as Browser
>>>
>>> browser = Browser()
>>> url = 'https://en.wikipedia.org/'
>>> factory = Factory(website_url=url, browser=browser)
>>> worker = DefaultWorker(factory)
>>> data = worker.get_data()
>>> data
    [
        PageData(
            website='https://en.wikipedia.org/',
            page_url='https://en.wikipedia.org/Email_address',
            data={
            'email': ['"John.Doe."@example.com', 'x@example.com'],
            'linkedin': ['linkedin_url1', 'linkedin_url2'],
            }
        ),
        PageData(
            website='https://en.wikipedia.org/',
            page_url='https://en.wikipedia.org/Email_address2',
            data={
            'email': ['"John.Doe2."@example.com', 'x2@example.com'],
            'linkedin': ['linkedin_url3', 'linkedin_url4'],
            }
        ),
    ]

data_extractors: List[Union[extract_emails.data_extractors.email_extractor.EmailExtractor, extract_emails.data_extractors.linkedin_extractor.LinkedinExtractor]] property readonly

Initialize EmailExtractor and LinkedinExtractor

Initialize ContactInfoLinkFilter