Skip to content

Browsers

PageSourceGetter

All browsers must inherit from this class

get_page_source(self, url)

Return page content from a URL

Parameters:

Name Type Description Default
url str

URL

required

Returns:

Type Description
str

page content (html, json, whatever)

Source code in extract_emails/browsers/page_source_getter.py
@abstractmethod
def get_page_source(self, url: str) -> str:
    """Return page content from a URL.

    This is an abstract hook: it is marked with ``@abstractmethod`` and has
    no implementation of its own, so subclasses supply the actual fetching.

    Args:
        url: URL

    Returns:
        page content (html, json, whatever)
    """

ChromeBrowser

Getting page sources with selenium and chromedriver

Examples:

>>> from extract_emails.browsers.chrome_browser import ChromeBrowser
>>> browser = ChromeBrowser()
>>> browser.open()
>>> page_source = browser.get_page_source('https://example.com')
>>> browser.close()
>>> from extract_emails.browsers.chrome_browser import ChromeBrowser
>>> with ChromeBrowser() as browser:
...     page_source = browser.get_page_source('https://example.com')

__init__(self, executable_path='/usr/bin/chromedriver', headless_mode=True, options=None) special

ChromeBrowser initialization

Parameters:

Name Type Description Default
executable_path PathLike

path to chromedriver; run `which chromedriver` to find it. Default: /usr/bin/chromedriver

'/usr/bin/chromedriver'
headless_mode bool

run browser with headless mode or not. Default: True

True
options Iterable[str]

arguments for chrome.Options(). Default: {"--disable-gpu", "--disable-software-rasterizer", "--disable-dev-shm-usage", "--window-size=1920x1080", "--disable-setuid-sandbox", "--no-sandbox"}

None
Source code in extract_emails/browsers/chrome_browser.py
def __init__(
    self,
    executable_path: PathLike = "/usr/bin/chromedriver",
    headless_mode: bool = True,
    options: Optional[Iterable[str]] = None,
) -> None:
    """ChromeBrowser initialization

    Only stores the settings on the instance; no driver is created here.

    Args:
        executable_path: path to chromedriver, use `which chromedriver` to get the path.
            Default: /usr/bin/chromedriver
        headless_mode: run browser with headless mode or not. Default: True
        options: arguments for chrome.Options(). ``None`` selects
            ``self.default_options``; any other value, including an empty
            iterable, is stored as given.
            NOTE(review): ``default_options`` is defined outside this block;
            it was previously documented as {"--disable-gpu",
            "--disable-software-rasterizer", "--disable-dev-shm-usage",
            "--window-size=1920x1080", "--disable-setuid-sandbox",
            "--no-sandbox"} - confirm against the class attribute.
    """
    self.executable_path = executable_path
    self.headless_mode = headless_mode
    # Compare against None rather than truthiness so that an explicit empty
    # iterable still overrides the defaults.
    self.options = options if options is not None else self.default_options
    # No driver yet; this attribute stays None until one is assigned.
    self.driver: Optional[webdriver.Chrome] = None

close(self)

Close the browser

Source code in extract_emails/browsers/chrome_browser.py
def close(self):
    """Close the browser

    Calls ``close()`` and then ``quit()`` on the attached driver and clears
    ``self.driver``. Does nothing when ``self.driver`` is ``None``, so it is
    safe to call before a driver exists or more than once.

    Raises:
        Exception: whatever ``driver.close()`` or ``driver.quit()`` raises;
            ``quit()`` is still attempted when ``close()`` fails.
    """
    # Detach first so the instance never keeps a reference to a driver that
    # has been (or is being) shut down.
    driver, self.driver = self.driver, None
    if driver is None:
        return

    try:
        driver.close()
    finally:
        # Always reached, so a failing close() cannot skip the quit() call.
        driver.quit()

get_page_source(self, url)

Get page source text from URL

Parameters:

Name Type Description Default
url str

URL

required

Returns:

Type Description
str

page source as text

Source code in extract_emails/browsers/chrome_browser.py
def get_page_source(self, url: str) -> str:
    """Load ``url`` in the driver and return the resulting page source.

    Args:
        url: URL

    Returns:
        page source as text; an empty string when loading raised an exception
    """
    blank_document = "<html><head></head><body></body></html>"

    try:
        self.driver.get(url)
        # Pause for the configured delay before reading the source.
        time.sleep(self.wait_seconds_after_get)
        html = self.driver.page_source
    except Exception as e:
        logger.error(f"Could not get page source from {url}: {e}")
        return ""
    else:
        if html == blank_document:
            # Only logged: the blank document is still handed back to the caller.
            logger.error(f"Could not get page source from {url}: Unknown reason")
        return html

open(self)

Add arguments to chrome.Options() and run the browser

Source code in extract_emails/browsers/chrome_browser.py
def open(self):
    """Build the chrome Options from the stored arguments and start the driver.

    Every entry of ``self.options`` is added as an argument, followed by
    "--headless" when ``self.headless_mode`` is truthy. The new driver is
    stored on ``self.driver``.
    """
    arguments = list(self.options)
    if self.headless_mode:
        arguments.append("--headless")

    chrome_options = Options()
    for argument in arguments:
        chrome_options.add_argument(argument)

    self.driver = webdriver.Chrome(
        executable_path=self.executable_path, options=chrome_options
    )

RequestsBrowser

Wrapper on requests library

Examples:

>>> from extract_emails.browsers.requests_browser import RequestsBrowser
>>> browser = RequestsBrowser()
>>> page_source = browser.get_page_source('https://example.com')

__init__(self, headers=None) special

Parameters:

Name Type Description Default
headers Dict[str, Any]

headers for requests

None
Source code in extract_emails/browsers/requests_browser.py
def __init__(self, headers: Dict[str, Any] = None):
    """Create the requests session and remember the headers to send.

    Args:
        headers: headers for requests
    """
    self.session = requests.Session()
    self.headers = headers

get_page_source(self, url)

Get page source text from URL

Parameters:

Name Type Description Default
url str

URL

required

Returns:

Type Description
str

page source as text

Source code in extract_emails/browsers/requests_browser.py
def get_page_source(self, url: str) -> str:
    """Get page source text from URL

    The request is sent through ``self.session`` with ``self.headers``.
    Any exception raised by the request is logged and turned into an empty
    string, so this method does not raise for request failures.

    Args:
        url: URL

    Returns:
        page source as text, or an empty string when the request raised
    """
    try:
        # Go through the instance's session rather than the module-level
        # requests.get, so the session object held by this browser is the
        # one actually used for the request.
        # NOTE(review): no timeout is passed here - consider whether a
        # stalled server should be allowed to block this call.
        response = self.session.get(url, headers=self.headers)
    except Exception as e:
        logger.error(f"Could not get page source from {url}: {e}")
        return ""
    return response.text