- import hashlib
- import logging
- from typing import Optional
- from embedchain.config.add_config import ChunkerConfig
- from embedchain.helpers.json_serializable import JSONSerializable
- from embedchain.models.data_type import DataType
class BaseChunker(JSONSerializable):
    """Base chunker: loads raw data via a loader and splits it into deduplicated chunks."""

    def __init__(self, text_splitter):
        """Initialize the chunker.

        :param text_splitter: Object exposing ``split_text(content)``; used by
            :meth:`get_chunks` to split raw content into chunks.
        """
        self.text_splitter = text_splitter
        # Set later via set_data_type(); its .value is tagged onto every
        # chunk's metadata in create_chunks().
        self.data_type = None

    def create_chunks(self, loader, src, app_id=None, config: Optional[ChunkerConfig] = None):
        """
        Loads data and chunks it.

        :param loader: The loader whose `load_data` method is used to create
        the raw data.
        :param src: The data to be handled by the loader. Can be a URL for
        remote sources or local content for local loaders.
        :param app_id: App id used to generate the doc_id.
        :param config: Optional chunker config; only `min_chunk_size` is read
            (defaults to 1 when config is None).
        :return: dict with keys `documents`, `ids`, `metadatas` and `doc_id`.
        """
        documents = []
        chunk_ids = []
        metadatas = []
        # A set is the idiomatic structure for membership tests; the original
        # used a dict with dummy True values.
        seen_chunk_ids = set()
        min_chunk_size = config.min_chunk_size if config is not None else 1
        # Lazy %-style args avoid formatting the message when INFO is disabled.
        logging.info("[INFO] Skipping chunks smaller than %s characters", min_chunk_size)
        data_result = loader.load_data(src)
        data_records = data_result["data"]
        doc_id = data_result["doc_id"]
        # Prefix app_id in the document id if app_id is not None to
        # distinguish between different documents stored in the same
        # elasticsearch or opensearch index
        doc_id = f"{app_id}--{doc_id}" if app_id is not None else doc_id
        for data in data_records:
            content = data["content"]
            meta_data = data["meta_data"]
            # add data type to meta data to allow query using data type
            meta_data["data_type"] = self.data_type.value
            meta_data["doc_id"] = doc_id
            url = meta_data["url"]
            chunks = self.get_chunks(content)
            for chunk in chunks:
                # Content-addressed id: identical (chunk, url) pairs dedupe.
                chunk_id = hashlib.sha256((chunk + url).encode()).hexdigest()
                chunk_id = f"{app_id}--{chunk_id}" if app_id is not None else chunk_id
                # Keep only unseen chunks at or above the configured minimum size.
                if chunk_id not in seen_chunk_ids and len(chunk) >= min_chunk_size:
                    seen_chunk_ids.add(chunk_id)
                    chunk_ids.append(chunk_id)
                    documents.append(chunk)
                    # NOTE(review): all chunks of one record share the same
                    # meta_data dict object — original behavior, preserved.
                    metadatas.append(meta_data)
        return {
            "documents": documents,
            "ids": chunk_ids,
            "metadatas": metadatas,
            "doc_id": doc_id,
        }

    def get_chunks(self, content):
        """
        Returns chunks using text splitter instance.

        Override in child class if custom logic.
        """
        return self.text_splitter.split_text(content)

    def set_data_type(self, data_type: DataType):
        """
        set the data type of chunker
        """
        self.data_type = data_type
        # TODO: This should be done during initialization. This means it has to be done in the child classes.

    @staticmethod
    def get_word_count(documents) -> int:
        """Return the total count of space-separated words across all documents."""
        # Generator expression avoids materializing an intermediate list
        # (the original used sum([...])).
        return sum(len(document.split(" ")) for document in documents)