|
@@ -11,6 +11,8 @@ from tqdm import tqdm
|
|
|
|
|
|
from embedchain.models.data_type import DataType
|
|
|
|
|
|
+logger = logging.getLogger(__name__)
|
|
|
+
|
|
|
|
|
|
def parse_content(content, type):
|
|
|
implemented = ["html.parser", "lxml", "lxml-xml", "xml", "html5lib"]
|
|
@@ -61,7 +63,7 @@ def parse_content(content, type):
|
|
|
|
|
|
cleaned_size = len(content)
|
|
|
if original_size != 0:
|
|
|
- logging.info(
|
|
|
+ logger.info(
|
|
|
f"Cleaned page size: {cleaned_size} characters, down from {original_size} (shrunk: {original_size-cleaned_size} chars, {round((1-(cleaned_size/original_size)) * 100, 2)}%)" # noqa:E501
|
|
|
)
|
|
|
|
|
@@ -208,31 +210,31 @@ def detect_datatype(source: Any) -> DataType:
|
|
|
}
|
|
|
|
|
|
if url.netloc in YOUTUBE_ALLOWED_NETLOCKS:
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `youtube_video`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `youtube_video`.")
|
|
|
return DataType.YOUTUBE_VIDEO
|
|
|
|
|
|
if url.netloc in {"notion.so", "notion.site"}:
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `notion`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `notion`.")
|
|
|
return DataType.NOTION
|
|
|
|
|
|
if url.path.endswith(".pdf"):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `pdf_file`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `pdf_file`.")
|
|
|
return DataType.PDF_FILE
|
|
|
|
|
|
if url.path.endswith(".xml"):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `sitemap`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `sitemap`.")
|
|
|
return DataType.SITEMAP
|
|
|
|
|
|
if url.path.endswith(".csv"):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `csv`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `csv`.")
|
|
|
return DataType.CSV
|
|
|
|
|
|
if url.path.endswith(".mdx") or url.path.endswith(".md"):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `mdx`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `mdx`.")
|
|
|
return DataType.MDX
|
|
|
|
|
|
if url.path.endswith(".docx"):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `docx`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `docx`.")
|
|
|
return DataType.DOCX
|
|
|
|
|
|
if url.path.endswith(".yaml"):
|
|
@@ -242,14 +244,14 @@ def detect_datatype(source: Any) -> DataType:
|
|
|
try:
|
|
|
yaml_content = yaml.safe_load(response.text)
|
|
|
except yaml.YAMLError as exc:
|
|
|
- logging.error(f"Error parsing YAML: {exc}")
|
|
|
+ logger.error(f"Error parsing YAML: {exc}")
|
|
|
raise TypeError(f"Not a valid data type. Error loading YAML: {exc}")
|
|
|
|
|
|
if is_openapi_yaml(yaml_content):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `openapi`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `openapi`.")
|
|
|
return DataType.OPENAPI
|
|
|
else:
|
|
|
- logging.error(
|
|
|
+ logger.error(
|
|
|
f"Source of `{formatted_source}` does not contain all the required \
|
|
|
fields of OpenAPI yaml. Check 'https://spec.openapis.org/oas/v3.1.0'"
|
|
|
)
|
|
@@ -258,35 +260,35 @@ def detect_datatype(source: Any) -> DataType:
|
|
|
make sure you have all the required fields in YAML config data"
|
|
|
)
|
|
|
except requests.exceptions.RequestException as e:
|
|
|
- logging.error(f"Error fetching URL {formatted_source}: {e}")
|
|
|
+ logger.error(f"Error fetching URL {formatted_source}: {e}")
|
|
|
|
|
|
if url.path.endswith(".json"):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `json_file`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `json_file`.")
|
|
|
return DataType.JSON
|
|
|
|
|
|
if "docs" in url.netloc or ("docs" in url.path and url.scheme != "file"):
|
|
|
# `docs_site` detection via path is not accepted for local filesystem URIs,
|
|
|
# because that would mean all paths that contain `docs` are now doc sites, which is too aggressive.
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `docs_site`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `docs_site`.")
|
|
|
return DataType.DOCS_SITE
|
|
|
|
|
|
if "github.com" in url.netloc:
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `github`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `github`.")
|
|
|
return DataType.GITHUB
|
|
|
|
|
|
if is_google_drive_folder(url.netloc + url.path):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `google drive folder`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `google drive folder`.")
|
|
|
return DataType.GOOGLE_DRIVE_FOLDER
|
|
|
|
|
|
# If none of the above conditions are met, it's a general web page
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `web_page`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `web_page`.")
|
|
|
return DataType.WEB_PAGE
|
|
|
|
|
|
elif not isinstance(source, str):
|
|
|
# For datatypes where source is not a string.
|
|
|
|
|
|
if isinstance(source, tuple) and len(source) == 2 and isinstance(source[0], str) and isinstance(source[1], str):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `qna_pair`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `qna_pair`.")
|
|
|
return DataType.QNA_PAIR
|
|
|
|
|
|
# Raise an error if it isn't a string and also not a valid non-string type (one of the previous).
|
|
@@ -300,37 +302,37 @@ def detect_datatype(source: Any) -> DataType:
|
|
|
# Note: checking for string is not necessary anymore.
|
|
|
|
|
|
if source.endswith(".docx"):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `docx`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `docx`.")
|
|
|
return DataType.DOCX
|
|
|
|
|
|
if source.endswith(".csv"):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `csv`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `csv`.")
|
|
|
return DataType.CSV
|
|
|
|
|
|
if source.endswith(".xml"):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `xml`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `xml`.")
|
|
|
return DataType.XML
|
|
|
|
|
|
if source.endswith(".mdx") or source.endswith(".md"):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `mdx`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `mdx`.")
|
|
|
return DataType.MDX
|
|
|
|
|
|
if source.endswith(".txt"):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `text`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `text`.")
|
|
|
return DataType.TEXT_FILE
|
|
|
|
|
|
if source.endswith(".pdf"):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `pdf_file`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `pdf_file`.")
|
|
|
return DataType.PDF_FILE
|
|
|
|
|
|
if source.endswith(".yaml"):
|
|
|
with open(source, "r") as file:
|
|
|
yaml_content = yaml.safe_load(file)
|
|
|
if is_openapi_yaml(yaml_content):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `openapi`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `openapi`.")
|
|
|
return DataType.OPENAPI
|
|
|
else:
|
|
|
- logging.error(
|
|
|
+ logger.error(
|
|
|
f"Source of `{formatted_source}` does not contain all the required \
|
|
|
fields of OpenAPI yaml. Check 'https://spec.openapis.org/oas/v3.1.0'"
|
|
|
)
|
|
@@ -340,11 +342,11 @@ def detect_datatype(source: Any) -> DataType:
|
|
|
)
|
|
|
|
|
|
if source.endswith(".json"):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `json`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `json`.")
|
|
|
return DataType.JSON
|
|
|
|
|
|
if os.path.exists(source) and is_readable(open(source).read()):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `text_file`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `text_file`.")
|
|
|
return DataType.TEXT_FILE
|
|
|
|
|
|
# If the source is a valid file, that's not detectable as a type, an error is raised.
|
|
@@ -360,11 +362,11 @@ def detect_datatype(source: Any) -> DataType:
|
|
|
|
|
|
# check if the source is valid json string
|
|
|
if is_valid_json_string(source):
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `json`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `json`.")
|
|
|
return DataType.JSON
|
|
|
|
|
|
# Use text as final fallback.
|
|
|
- logging.debug(f"Source of `{formatted_source}` detected as `text`.")
|
|
|
+ logger.debug(f"Source of `{formatted_source}` detected as `text`.")
|
|
|
return DataType.TEXT
|
|
|
|
|
|
|