base_chunker.py 819 B

123456789101112131415161718192021222324252627
  1. import hashlib
  2. class BaseChunker:
  3. def __init__(self, text_splitter):
  4. self.text_splitter = text_splitter
  5. def create_chunks(self, loader, url):
  6. documents = []
  7. ids = []
  8. datas = loader.load_data(url)
  9. metadatas = []
  10. for data in datas:
  11. content = data["content"]
  12. meta_data = data["meta_data"]
  13. chunks = self.text_splitter.split_text(content)
  14. url = meta_data["url"]
  15. for chunk in chunks:
  16. chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
  17. ids.append(chunk_id)
  18. documents.append(chunk)
  19. metadatas.append(meta_data)
  20. return {
  21. "documents": documents,
  22. "ids": ids,
  23. "metadatas": metadatas,
  24. }