Models

PageData

Bases: BaseModel

Representation of data extracted from a webpage

Examples:

>>> from extract_emails.models import PageData
>>> page_data = PageData(website='https://example.com', page_url='https://example.com/page123')

Attributes:

    website (str): Website address the data was collected from.
    page_url (str): URL of the page the data was collected from.
    data (dict[str, list[str]]): Data extracted from the page, in the format {'label': [value, value]}. Default: {}.
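
Since PageData is a pydantic BaseModel, the standard pydantic v2 helpers apply to it as well; a quick sketch (model_dump is generic pydantic API, not something specific to this library):

>>> from extract_emails.models import PageData
>>> page = PageData(website='https://example.com', page_url='https://example.com/page123')
>>> page.data  # default_factory supplies a fresh empty dict
{}
>>> page.model_dump()
{'website': 'https://example.com', 'page_url': 'https://example.com/page123', 'data': {}}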

Source code in extract_emails/models/page_data.py
import csv
from itertools import zip_longest
from pathlib import Path

import aiofiles
from aiocsv import AsyncDictWriter
from pydantic import BaseModel, Field


class PageData(BaseModel):
    """Representation for data from a webpage

    Examples:
        >>> from extract_emails.models import PageData
        >>> page_data = PageData(website='https://example.com', page_url='https://example.com/page123')

    Attributes:
        website (str): website address from where data
        page_url (str): Page URL from where data
        data (Optional[Dict[str, List[str]]]): Data from the page in format: { 'label': [data, data] }, default: {}
    """

    website: str
    page_url: str
    data: dict[str, list[str]] = Field(default_factory=dict)

    def __len__(self) -> int:
        # Total number of collected values across all labels
        return sum(len(vals) for vals in self.data.values())

    def append(self, label: str, vals: list[str]) -> None:
        """Append data from a page to the self.data collection

        Examples:
            >>> from extract_emails.models import PageData
            >>> page_data = PageData(website='https://example.com', page_url='https://example.com/page123')
            >>> page_data.append('email', ['email1@email.com', 'email2@email.com'])
            >>> page_data.page
            >>> {'email': ['email@email.com', 'email2@email.com']}

        Args:
            label: name of collection, e.g. email, linkedin
            vals: data from a page, e.g. emails, specific URLs etc.
        """
        # Extend the label's list, creating it on first use; starting from a
        # fresh list keeps the caller's list from being stored by reference.
        self.data.setdefault(label, []).extend(vals)

    @classmethod
    def to_csv(cls, data: list["PageData"], filepath: Path) -> None:
        """Save list of `PageData` to CSV file

        Args:
            data: list of `PageData`
            filepath: path to a CSV file
        """
        base_headers: list[str] = list(cls.model_json_schema()["properties"].keys())
        base_headers.remove("data")
        # Data columns come from the first page; all pages are assumed to
        # share the same labels in the same order.
        data_headers = list(data[0].data.keys())
        headers = base_headers + data_headers
        is_file_exists = filepath.exists()

        with open(filepath, "a", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            if not is_file_exists:
                writer.writeheader()
            for page in data:
                # zip_longest pads shorter columns with None so every value is written
                for data_in_row in zip_longest(*page.data.values()):
                    new_row = {"website": page.website, "page_url": page.page_url}
                    for counter, column in enumerate(data_headers):
                        new_row[column] = data_in_row[counter]

                    writer.writerow(new_row)

    @classmethod
    async def ato_csv(cls, data: list["PageData"], filepath: Path) -> None:
        """Async save list of `PageData` to CSV file

        Args:
            data: list of `PageData`
            filepath: path to a CSV file
        """
        base_headers: list[str] = list(cls.model_json_schema()["properties"].keys())
        base_headers.remove("data")
        # Data columns come from the first page; all pages are assumed to
        # share the same labels in the same order.
        data_headers = list(data[0].data.keys())
        headers = base_headers + data_headers
        is_file_exists = filepath.exists()

        async with aiofiles.open(filepath, "a", encoding="utf-8", newline="") as f:
            writer = AsyncDictWriter(f, fieldnames=headers)
            if not is_file_exists:
                await writer.writeheader()
            for page in data:
                # zip_longest pads shorter columns with None so every value is written
                for data_in_row in zip_longest(*page.data.values()):
                    new_row = {"website": page.website, "page_url": page.page_url}
                    for counter, column in enumerate(data_headers):
                        new_row[column] = data_in_row[counter]

                    await writer.writerow(new_row)
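
A minimal end-to-end sketch of the model above. The labels and the out.csv path are illustrative assumptions, not part of the library:

>>> from pathlib import Path
>>> from extract_emails.models import PageData
>>> page = PageData(website='https://example.com', page_url='https://example.com/contacts')
>>> page.append('email', ['info@example.com', 'sales@example.com'])
>>> page.append('linkedin', ['https://linkedin.com/company/example'])
>>> len(page)  # total values across all labels
3
>>> PageData.to_csv([page], Path('out.csv'))  # 'out.csv' is an illustrative path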

append(label, vals)

Append data from a page to the self.data collection

Examples:

>>> from extract_emails.models import PageData
>>> page_data = PageData(website='https://example.com', page_url='https://example.com/page123')
>>> page_data.append('email', ['email1@email.com', 'email2@email.com'])
>>> page_data.data
{'email': ['email1@email.com', 'email2@email.com']}

Parameters:

    label (str): Name of the collection, e.g. email, linkedin. Required.
    vals (list[str]): Data from the page, e.g. emails, specific URLs, etc. Required.
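
Because append extends an existing label rather than replacing it, repeated calls accumulate values; a short sketch:

>>> from extract_emails.models import PageData
>>> page_data = PageData(website='https://example.com', page_url='https://example.com/about')
>>> page_data.append('email', ['a@example.com'])
>>> page_data.append('email', ['b@example.com'])  # extends the existing 'email' list
>>> page_data.data
{'email': ['a@example.com', 'b@example.com']}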

ato_csv(data, filepath) async classmethod

Asynchronously save a list of PageData to a CSV file

Parameters:

    data (list[PageData]): List of PageData instances. Required.
    filepath (Path): Path to the CSV file. Required.
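
A minimal sketch of driving the async writer from a script; asyncio.run and the results.csv path are illustrative assumptions:

>>> import asyncio
>>> from pathlib import Path
>>> from extract_emails.models import PageData
>>> async def main():
...     page = PageData(website='https://example.com', page_url='https://example.com/contacts')
...     page.append('email', ['info@example.com'])
...     await PageData.ato_csv([page], Path('results.csv'))
>>> asyncio.run(main())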

to_csv(data, filepath) classmethod

Save a list of PageData to a CSV file

Parameters:

    data (list[PageData]): List of PageData instances. Required.
    filepath (Path): Path to the CSV file. Required.
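
The file is opened in append mode and the header is written only when the file does not yet exist, so repeated calls accumulate rows in the same CSV; a sketch under that reading (the contacts.csv path is illustrative):

>>> from pathlib import Path
>>> from extract_emails.models import PageData
>>> out = Path('contacts.csv')
>>> first = PageData(website='https://example.com', page_url='https://example.com/a')
>>> first.append('email', ['a@example.com'])
>>> PageData.to_csv([first], out)   # creates the file and writes the header
>>> second = PageData(website='https://example.com', page_url='https://example.com/b')
>>> second.append('email', ['b@example.com'])
>>> PageData.to_csv([second], out)  # appends rows; header is not repeated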