Models

PageData

Bases: BaseModel

Representation of data extracted from a webpage

Examples:

>>> from extract_emails.models import PageData
>>> page_data = PageData(website='https://example.com', page_url='https://example.com/page123')

Attributes:

    website (str): Website address the data was collected from.
    page_url (str): URL of the page the data was collected from.
    data (dict[str, list[str]]): Data extracted from the page, in the format {'label': [value, value]}. Default: {}.
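
Since PageData is a pydantic BaseModel, the standard pydantic v2 helpers apply to it as well; a quick sketch (model_dump is generic pydantic API, not something specific to this library):

>>> from extract_emails.models import PageData
>>> page = PageData(website='https://example.com', page_url='https://example.com/page123')
>>> page.data  # default_factory supplies a fresh empty dict
{}
>>> page.model_dump()
{'website': 'https://example.com', 'page_url': 'https://example.com/page123', 'data': {}}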

Source code in extract_emails/models/page_data.py
import csv
from itertools import zip_longest
from pathlib import Path

import aiofiles
from aiocsv import AsyncDictWriter
from pydantic import BaseModel, Field


class PageData(BaseModel):
    """Representation for data from a webpage

    Examples:
        >>> from extract_emails.models import PageData
        >>> page_data = PageData(website='https://example.com', page_url='https://example.com/page123')

    Attributes:
        website (str): website address from where data
        page_url (str): Page URL from where data
        data (Optional[Dict[str, List[str]]]): Data from the page in format: { 'label': [data, data] }, default: {}
    """

    website: str
    page_url: str
    data: dict[str, list[str]] = Field(default_factory=dict)

    def __len__(self) -> int:
        # Total number of collected values across all labels
        return sum(len(vals) for vals in self.data.values())

    def append(self, label: str, vals: list[str]) -> None:
        """Append data from a page to the self.data collection

        Examples:
            >>> from extract_emails.models import PageData
            >>> page_data = PageData(website='https://example.com', page_url='https://example.com/page123')
            >>> page_data.append('email', ['email1@email.com', 'email2@email.com'])
            >>> page_data.page
            >>> {'email': ['email@email.com', 'email2@email.com']}

        Args:
            label: name of collection, e.g. email, linkedin
            vals: data from a page, e.g. emails, specific URLs etc.
        """
        # Extend the label's list, creating it on first use; starting from a
        # fresh list keeps the caller's list from being stored by reference.
        self.data.setdefault(label, []).extend(vals)

    @classmethod
    def to_csv(cls, data: list["PageData"], filepath: Path) -> None:
        """Save list of `PageData` to CSV file

        Args:
            data: list of `PageData`
            filepath: path to a CSV file
        """
        base_headers: list[str] = list(cls.model_json_schema()["properties"].keys())
        base_headers.remove("data")
        # Data columns come from the first page; all pages are assumed to
        # share the same labels in the same order.
        data_headers = list(data[0].data.keys())
        headers = base_headers + data_headers
        is_file_exists = filepath.exists()

        with open(filepath, "a", encoding="utf-8", newline="") as f:
            writer = csv.DictWriter(f, fieldnames=headers)
            if not is_file_exists:
                writer.writeheader()
            for page in data:
                # zip_longest pads shorter columns with None so every value is written
                for data_in_row in zip_longest(*page.data.values()):
                    new_row = {"website": page.website, "page_url": page.page_url}
                    for counter, column in enumerate(data_headers):
                        new_row[column] = data_in_row[counter]

                    writer.writerow(new_row)

    @classmethod
    async def ato_csv(cls, data: list["PageData"], filepath: Path) -> None:
        """Async save list of `PageData` to CSV file

        Args:
            data: list of `PageData`
            filepath: path to a CSV file
        """
        base_headers: list[str] = list(cls.model_json_schema()["properties"].keys())
        base_headers.remove("data")
        # Data columns come from the first page; all pages are assumed to
        # share the same labels in the same order.
        data_headers = list(data[0].data.keys())
        headers = base_headers + data_headers
        is_file_exists = filepath.exists()

        async with aiofiles.open(filepath, "a", encoding="utf-8", newline="") as f:
            writer = AsyncDictWriter(f, fieldnames=headers)
            if not is_file_exists:
                await writer.writeheader()
            for page in data:
                # zip_longest pads shorter columns with None so every value is written
                for data_in_row in zip_longest(*page.data.values()):
                    new_row = {"website": page.website, "page_url": page.page_url}
                    for counter, column in enumerate(data_headers):
                        new_row[column] = data_in_row[counter]

                    await writer.writerow(new_row)
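
A minimal end-to-end sketch of the model above. The labels and the out.csv path are illustrative assumptions, not part of the library:

>>> from pathlib import Path
>>> from extract_emails.models import PageData
>>> page = PageData(website='https://example.com', page_url='https://example.com/contacts')
>>> page.append('email', ['info@example.com', 'sales@example.com'])
>>> page.append('linkedin', ['https://linkedin.com/company/example'])
>>> len(page)  # total values across all labels
3
>>> PageData.to_csv([page], Path('out.csv'))  # 'out.csv' is an illustrative path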

append(label, vals)

Append data from a page to the self.data collection

Examples:

>>> from extract_emails.models import PageData
>>> page_data = PageData(website='https://example.com', page_url='https://example.com/page123')
>>> page_data.append('email', ['email1@email.com', 'email2@email.com'])
>>> page_data.data
{'email': ['email1@email.com', 'email2@email.com']}

Parameters:

    label (str): Name of the collection, e.g. email, linkedin. Required.
    vals (list[str]): Data from the page, e.g. emails, specific URLs, etc. Required.
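
Because append extends an existing label rather than replacing it, repeated calls accumulate values; a short sketch:

>>> from extract_emails.models import PageData
>>> page_data = PageData(website='https://example.com', page_url='https://example.com/about')
>>> page_data.append('email', ['a@example.com'])
>>> page_data.append('email', ['b@example.com'])  # extends the existing 'email' list
>>> page_data.data
{'email': ['a@example.com', 'b@example.com']}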

ato_csv(data, filepath) async classmethod

Asynchronously save a list of PageData to a CSV file

Parameters:

    data (list[PageData]): List of PageData instances. Required.
    filepath (Path): Path to the CSV file. Required.
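
A minimal sketch of driving the async writer from a script; asyncio.run and the results.csv path are illustrative assumptions:

>>> import asyncio
>>> from pathlib import Path
>>> from extract_emails.models import PageData
>>> async def main():
...     page = PageData(website='https://example.com', page_url='https://example.com/contacts')
...     page.append('email', ['info@example.com'])
...     await PageData.ato_csv([page], Path('results.csv'))
>>> asyncio.run(main())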

to_csv(data, filepath) classmethod

Save a list of PageData to a CSV file

Parameters:

    data (list[PageData]): List of PageData instances. Required.
    filepath (Path): Path to the CSV file. Required.
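
The file is opened in append mode and the header is written only when the file does not yet exist, so repeated calls accumulate rows in the same CSV; a sketch under that reading (the contacts.csv path is illustrative):

>>> from pathlib import Path
>>> from extract_emails.models import PageData
>>> out = Path('contacts.csv')
>>> first = PageData(website='https://example.com', page_url='https://example.com/a')
>>> first.append('email', ['a@example.com'])
>>> PageData.to_csv([first], out)   # creates the file and writes the header
>>> second = PageData(website='https://example.com', page_url='https://example.com/b')
>>> second.append('email', ['b@example.com'])
>>> PageData.to_csv([second], out)  # appends rows; header is not repeated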