import hashlib import logging import os import quopri from textwrap import dedent from bs4 import BeautifulSoup try: from llama_index import download_loader except ImportError: raise ImportError("Notion requires extra dependencies. Install with `pip install embedchain[community]`") from None from embedchain.loaders.base_loader import BaseLoader from embedchain.utils import clean_string def get_header(text: str, header: str) -> str: start_string_position = text.find(header) pos_start = text.find(":", start_string_position) + 1 pos_end = text.find("\n", pos_start) header = text[pos_start:pos_end] return header.strip() class GmailLoader(BaseLoader): def load_data(self, query): """Load data from gmail.""" if not os.path.isfile("credentials.json"): raise FileNotFoundError( "You must download the valid credentials file from your google \ dev account. Refer this `https://cloud.google.com/docs/authentication/api-keys`" ) GmailReader = download_loader("GmailReader") loader = GmailReader(query=query, service=None, results_per_page=20) documents = loader.load_data() logging.info(f"Gmail Loader: {len(documents)} mails found for query- {query}") data = [] data_contents = [] logging.info(f"Gmail Loader: {len(documents)} mails found") for document in documents: original_size = len(document.text) snippet = document.metadata.get("snippet") meta_data = { "url": document.metadata.get("id"), "date": get_header(document.text, "Date"), "subject": get_header(document.text, "Subject"), "from": get_header(document.text, "From"), "to": get_header(document.text, "To"), "search_query": query, } # Decode decoded_bytes = quopri.decodestring(document.text) decoded_str = decoded_bytes.decode("utf-8", errors="replace") # Slice mail_start = decoded_str.find("