Browsers¶
PageSourceGetter
¶
All browsers must inherit from this class
get_page_source(self, url)
¶
Return page content from a URL
Parameters:
Name | Type | Description | Default |
---|---|---|---|
url |
str |
URL |
required |
Returns:
Type | Description |
---|---|
str |
page content (html, json, whatever) |
Source code in extract_emails/browsers/page_source_getter.py
@abstractmethod
def get_page_source(self, url: str) -> str:
    """Fetch the content served at the given URL.

    This is the abstract hook every browser class has to implement.

    Args:
        url: address of the page to fetch

    Returns:
        the page content as text (html, json, whatever)
    """
ChromeBrowser
¶
Getting page sources with selenium and chromedriver
Examples:
>>> from extract_emails.browsers.chrome_browser import ChromeBrowser
>>> browser = ChromeBrowser()
>>> browser.open()
>>> page_source = browser.get_page_source('https://example.com')
>>> browser.close()
>>> from extract_emails.browsers.chrome_browser import ChromeBrowser
>>> with ChromeBrowser() as browser:
... page_source = browser.get_page_source('https://example.com')
__init__(self, executable_path='/usr/bin/chromedriver', headless_mode=True, options=None)
special
¶
ChromeBrowser initialization
Parameters:
Name | Type | Description | Default |
---|---|---|---|
executable_path |
PathLike |
path to chromedriver, use `which chromedriver` to get the path. Default: /usr/bin/chromedriver |
'/usr/bin/chromedriver' |
headless_mode |
bool |
run browser with headless mode or not. Default: True |
True |
options |
Iterable[str] |
arguments for chrome.Options(). Default: {"--disable-gpu", "--disable-software-rasterizer", "--disable-dev-shm-usage", "--window-size=1920x1080", "--disable-setuid-sandbox", "--no-sandbox"} |
None |
Source code in extract_emails/browsers/chrome_browser.py
def __init__(
    self,
    executable_path: PathLike = "/usr/bin/chromedriver",
    headless_mode: bool = True,
    options: Optional[Iterable[str]] = None,
) -> None:
    """Store the ChromeBrowser configuration.

    No browser is started here; `self.driver` is only initialised to None.

    Args:
        executable_path: path to chromedriver, use `which chromedriver` to get the path.
            Default: /usr/bin/chromedriver
        headless_mode: run browser with headless mode or not. Default: True
        options: arguments for chrome.Options(). When None, `self.default_options`
            is used; it was originally documented as "--disable-gpu",
            "--disable-software-rasterizer", "--disable-dev-shm-usage",
            "--window-size=1920x1080", "--disable-setuid-sandbox", "--no-sandbox".
    """
    self.executable_path = executable_path
    self.headless_mode = headless_mode
    # Test against None explicitly so that a deliberately empty iterable is
    # kept instead of being replaced by the defaults.
    self.options = self.default_options if options is None else options
    # No driver exists yet; open() is the method that assigns one.
    self.driver: Optional[webdriver.Chrome] = None
close(self)
¶
Close the browser
Source code in extract_emails/browsers/chrome_browser.py
def close(self):
    """Close the browser and shut the driver down.

    Does nothing when no driver is attached (`self.driver` is None), so it is
    safe to call before `open()` or more than once. Any exception raised by
    the driver's `close()` still propagates, but `quit()` is always attempted
    first so the driver is not left running.
    """
    driver = self.driver
    if driver is None:
        return
    try:
        driver.close()
    finally:
        # quit() must run even if close() failed, otherwise the driver
        # would be left behind; drop the reference so a repeated close()
        # becomes a no-op.
        try:
            driver.quit()
        finally:
            self.driver = None
get_page_source(self, url)
¶
Get page source text from URL
Parameters:
Name | Type | Description | Default |
---|---|---|---|
url |
str |
URL |
required |
Returns:
Type | Description |
---|---|
str |
page source as text |
Source code in extract_emails/browsers/chrome_browser.py
def get_page_source(self, url: str) -> str:
    """Load a URL in the driver and return the resulting page source.

    Args:
        url: URL

    Returns:
        page source as text; an empty string when loading the page raised
        an exception (the error is logged)
    """
    blank_document = "<html><head></head><body></body></html>"
    try:
        self.driver.get(url)
        # Pause for the configured number of seconds before reading the source.
        time.sleep(self.wait_seconds_after_get)
        html = self.driver.page_source
    except Exception as e:
        logger.error(f"Could not get page source from {url}: {e}")
        return ""
    else:
        # A blank document is reported as an error but still returned as-is.
        if html == blank_document:
            logger.error(f"Could not get page source from {url}: Unknown reason")
        return html
open(self)
¶
Add arguments to chrome.Options() and run the browser
Source code in extract_emails/browsers/chrome_browser.py
def open(self):
    """Build chrome.Options() from the configured arguments and start the browser"""
    chrome_options = Options()
    for argument in self.options:
        chrome_options.add_argument(argument)
    # Headless mode is just one more command-line argument.
    if self.headless_mode:
        chrome_options.add_argument("--headless")
    driver_kwargs = {
        "options": chrome_options,
        "executable_path": self.executable_path,
    }
    self.driver = webdriver.Chrome(**driver_kwargs)
RequestsBrowser
¶
Wrapper on requests library
Examples:
>>> from extract_emails.browsers.requests_browser import RequestsBrowser
>>> browser = RequestsBrowser()
>>> page_source = browser.get_page_source('https://example.com')
__init__(self, headers=None)
special
¶
Parameters:
Name | Type | Description | Default |
---|---|---|---|
headers |
Dict[str, Any] |
headers for requests |
None |
Source code in extract_emails/browsers/requests_browser.py
def __init__(self, headers: Dict[str, Any] = None):
    """Store the request headers and create a requests session.

    Args:
        headers: headers for requests
    """
    self.headers = headers
    http_session = requests.Session()
    self.session = http_session
get_page_source(self, url)
¶
Get page source text from URL
Parameters:
Name | Type | Description | Default |
---|---|---|---|
url |
str |
URL |
required |
Returns:
Type | Description |
---|---|
str |
page source as text |
Source code in extract_emails/browsers/requests_browser.py
def get_page_source(self, url: str) -> str:
    """Get page source text from URL

    The request goes through `self.session` (created in `__init__`) rather
    than a one-off module-level `requests.get` call, so the session is
    actually used.

    Args:
        url: URL

    Returns:
        page source as text; an empty string when the request raised an
        exception (the error is logged)
    """
    # NOTE(review): no timeout is passed, so a server that never responds
    # can block this call indefinitely - consider adding one.
    try:
        response = self.session.get(url, headers=self.headers)
    except Exception as e:
        logger.error(f"Could not get page source from {url}: {e}")
        return ""
    return response.text