notion.py 4.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119
  1. import hashlib
  2. import logging
  3. import os
  4. from typing import Any, Optional
  5. import requests
  6. from embedchain.helpers.json_serializable import register_deserializable
  7. from embedchain.loaders.base_loader import BaseLoader
  8. from embedchain.utils.misc import clean_string
  9. class NotionDocument:
  10. """
  11. A simple Document class to hold the text and additional information of a page.
  12. """
  13. def __init__(self, text: str, extra_info: dict[str, Any]):
  14. self.text = text
  15. self.extra_info = extra_info
  16. class NotionPageLoader:
  17. """
  18. Notion Page Loader.
  19. Reads a set of Notion pages.
  20. """
  21. BLOCK_CHILD_URL_TMPL = "https://api.notion.com/v1/blocks/{block_id}/children"
  22. def __init__(self, integration_token: Optional[str] = None) -> None:
  23. """Initialize with Notion integration token."""
  24. if integration_token is None:
  25. integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
  26. if integration_token is None:
  27. raise ValueError(
  28. "Must specify `integration_token` or set environment " "variable `NOTION_INTEGRATION_TOKEN`."
  29. )
  30. self.token = integration_token
  31. self.headers = {
  32. "Authorization": "Bearer " + self.token,
  33. "Content-Type": "application/json",
  34. "Notion-Version": "2022-06-28",
  35. }
  36. def _read_block(self, block_id: str, num_tabs: int = 0) -> str:
  37. """Read a block from Notion."""
  38. done = False
  39. result_lines_arr = []
  40. cur_block_id = block_id
  41. while not done:
  42. block_url = self.BLOCK_CHILD_URL_TMPL.format(block_id=cur_block_id)
  43. res = requests.get(block_url, headers=self.headers)
  44. data = res.json()
  45. for result in data["results"]:
  46. result_type = result["type"]
  47. result_obj = result[result_type]
  48. cur_result_text_arr = []
  49. if "rich_text" in result_obj:
  50. for rich_text in result_obj["rich_text"]:
  51. if "text" in rich_text:
  52. text = rich_text["text"]["content"]
  53. prefix = "\t" * num_tabs
  54. cur_result_text_arr.append(prefix + text)
  55. result_block_id = result["id"]
  56. has_children = result["has_children"]
  57. if has_children:
  58. children_text = self._read_block(result_block_id, num_tabs=num_tabs + 1)
  59. cur_result_text_arr.append(children_text)
  60. cur_result_text = "\n".join(cur_result_text_arr)
  61. result_lines_arr.append(cur_result_text)
  62. if data["next_cursor"] is None:
  63. done = True
  64. else:
  65. cur_block_id = data["next_cursor"]
  66. result_lines = "\n".join(result_lines_arr)
  67. return result_lines
  68. def load_data(self, page_ids: list[str]) -> list[NotionDocument]:
  69. """Load data from the given list of page IDs."""
  70. docs = []
  71. for page_id in page_ids:
  72. page_text = self._read_block(page_id)
  73. docs.append(NotionDocument(text=page_text, extra_info={"page_id": page_id}))
  74. return docs
  75. @register_deserializable
  76. class NotionLoader(BaseLoader):
  77. def load_data(self, source):
  78. """Load data from a Notion URL."""
  79. id = source[-32:]
  80. formatted_id = f"{id[:8]}-{id[8:12]}-{id[12:16]}-{id[16:20]}-{id[20:]}"
  81. logging.debug(f"Extracted notion page id as: {formatted_id}")
  82. integration_token = os.getenv("NOTION_INTEGRATION_TOKEN")
  83. reader = NotionPageLoader(integration_token=integration_token)
  84. documents = reader.load_data(page_ids=[formatted_id])
  85. raw_text = documents[0].text
  86. text = clean_string(raw_text)
  87. doc_id = hashlib.sha256((text + source).encode()).hexdigest()
  88. return {
  89. "doc_id": doc_id,
  90. "data": [
  91. {
  92. "content": text,
  93. "meta_data": {"url": f"notion-{formatted_id}"},
  94. }
  95. ],
  96. }