Skip to content

Models

PageData pydantic-model

Representation for data from a webpage

Examples:

>>> from extract_emails.models import PageData
>>> page_data = PageData(website='https://example.com', page_url='https://example.com/page123')

Attributes:

Name Type Description
website str

Address of the website the data was collected from

page_url str

URL of the page the data was collected from

data Optional[Dict[str, List[str]]]

Data from the page in format: { 'label': [data, data] }, default: {}

append(self, label, vals)

Append data from a page to the self.data collection

Examples:

>>> from extract_emails.models import PageData
>>> page_data = PageData(website='https://example.com', page_url='https://example.com/page123')
>>> page_data.append('email', ['email1@email.com', 'email2@email.com'])
>>> page_data.data
{'email': ['email1@email.com', 'email2@email.com']}

Parameters:

Name Type Description Default
label str

name of collection, e.g. email, linkedin

required
vals List[str]

data from a page, e.g. emails, specific URLs etc.

required
Source code in extract_emails/models/page_data.py
def append(self, label: str, vals: List[str]) -> None:
    """Append data from a page to the ``self.data`` collection.

    Values are added to the list already stored under ``label``; a new list
    is created when the label is seen for the first time. The values are
    always copied, so the caller's ``vals`` list is never stored or mutated.

    Examples:
        >>> from extract_emails.models import PageData
        >>> page_data = PageData(website='https://example.com', page_url='https://example.com/page123')
        >>> page_data.append('email', ['email1@email.com', 'email2@email.com'])
        >>> page_data.data
        {'email': ['email1@email.com', 'email2@email.com']}

    Args:
        label: name of collection, e.g. email, linkedin
        vals: data from a page, e.g. emails, specific URLs etc.
    """
    # `data` is declared Optional, so tolerate an explicit None.
    if self.data is None:
        self.data = {}
    # setdefault + extend copies the items instead of aliasing `vals`;
    # storing `vals` itself would let later appends mutate the caller's list.
    self.data.setdefault(label, []).extend(vals)

save_as_csv(data, filepath) classmethod

Save list of PageData to CSV file

Parameters:

Name Type Description Default
data List[PageData]

list of PageData

required
filepath PathLike

path to a CSV file

required
Source code in extract_emails/models/page_data.py
@classmethod
def save_as_csv(cls, data: List["PageData"], filepath: PathLike) -> None:
    """Save list of `PageData` to CSV file

    The header row consists of the model fields (except ``data``) followed
    by every label found in any page's ``data``, in first-seen order. Each
    page produces as many rows as its longest value list; shorter lists and
    labels missing from a page are left blank. A page without any values
    produces no rows. An empty ``data`` list writes only the header row.

    Args:
        data: list of `PageData`
        filepath: path to a CSV file
    """
    base_headers: List[str] = [
        name for name in cls.schema()["properties"] if name != "data"
    ]
    # Collect labels from every page, not just the first one, so pages with
    # different labels still end up with a column each.
    data_headers: List[str] = list(
        dict.fromkeys(label for page in data for label in (page.data or {}))
    )
    headers = base_headers + data_headers

    with open(filepath, "w", encoding="utf-8", newline="") as f:
        writer = csv.DictWriter(f, fieldnames=headers)
        writer.writeheader()
        for page in data:
            page_values = page.data or {}
            # Look values up by label so they land in the right column
            # regardless of the key order inside this page's dict.
            columns = [page_values.get(label, []) for label in data_headers]
            for values_in_row in zip_longest(*columns):
                new_row = {"website": page.website, "page_url": page.page_url}
                new_row.update(zip(data_headers, values_in_row))
                writer.writerow(new_row)