base_chunker.py 1.2 KB

123456789101112131415161718192021222324252627282930313233343536373839
  1. import hashlib
  2. class BaseChunker:
  3. def __init__(self, text_splitter):
  4. ''' Initialize the chunker. '''
  5. self.text_splitter = text_splitter
  6. def create_chunks(self, loader, src):
  7. """
  8. Loads data and chunks it.
  9. :param loader: The loader which's `load_data` method is used to create the raw data.
  10. :param src: The data to be handled by the loader. Can be a URL for remote sources or local content for local loaders.
  11. """
  12. documents = []
  13. ids = []
  14. idMap = {}
  15. datas = loader.load_data(src)
  16. metadatas = []
  17. for data in datas:
  18. content = data["content"]
  19. meta_data = data["meta_data"]
  20. url = meta_data["url"]
  21. chunks = self.text_splitter.split_text(content)
  22. for chunk in chunks:
  23. chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
  24. if (idMap.get(chunk_id) is None):
  25. idMap[chunk_id] = True
  26. ids.append(chunk_id)
  27. documents.append(chunk)
  28. metadatas.append(meta_data)
  29. return {
  30. "documents": documents,
  31. "ids": ids,
  32. "metadatas": metadatas,
  33. }