| from uuid import uuid4 |
| from KG_classes import FileNode, ChunkNode, Property, Node, Relationship |
| from langchain_community.graphs.graph_document import ( |
| Node as BaseNode, |
| Relationship as BaseRelationship, |
| ) |
| from langchain.schema import Document |
| import os |
| from typing import List |
| from langchain.text_splitter import TokenTextSplitter |
| from langchain_community.document_loaders import TextLoader, PyPDFLoader |
|
|
|
|
def env_parse(file) -> dict:
    """Parse a simple ``KEY=VALUE`` credentials file into a dictionary.

    Blank lines, lines whose first non-whitespace character is ``#``, and
    lines that contain no ``=`` separator are skipped. Every other line is
    split on the FIRST ``=`` only, so values that themselves contain ``=``
    (connection strings, tokens) are kept intact. Keys and values are
    stripped of surrounding whitespace.

    Args:
        file: Path of the file to read.

    Returns:
        dict: Mapping of key to value, both strings. If a key occurs more
        than once, the last occurrence wins.
    """
    credentials = {}
    with open(file, "r") as f:
        for raw_line in f:
            entry = raw_line.strip()
            if not entry or entry.startswith("#"):
                continue
            key, separator, value = entry.partition("=")
            if not separator:
                # Not a KEY=VALUE line; ignore it instead of crashing.
                continue
            credentials[key.strip()] = value.strip()
    return credentials
|
|
|
|
def create_file_node(file_path: str) -> FileNode:
    """Create the graph node that represents a source file.

    The node id and the ``name`` property are both the final component of
    ``file_path`` as returned by ``os.path.basename``, so the two always
    agree and the platform's path separators are honoured. The unmodified
    ``file_path`` is stored in the ``path`` property.

    Args:
        file_path: Path of the file the node stands for.

    Returns:
        FileNode: Node of type ``"File"`` with ``path`` and ``name``
        properties.
    """
    # Compute the base name once so id and name cannot drift apart.
    file_name = os.path.basename(file_path)
    return FileNode(
        id=file_name,
        type="File",
        properties=[
            Property(key="path", value=file_path),
            Property(key="name", value=file_name),
        ],
    )
|
|
|
|
def create_chunk_node(
    chunk: Document, chunk_idx: int, file_node: FileNode
) -> ChunkNode:
    """Build the graph node for one chunk of a file.

    The node id is the owning file node's id immediately followed by the
    chunk index (no separator). The chunk text, its index (as a string) and
    the owning file's id are stored as the ``content``, ``idx`` and
    ``sourceFileId`` properties.

    Args:
        chunk: Document whose ``page_content`` becomes the node content.
        chunk_idx: Position of the chunk; stringified for id and property.
        file_node: Node of the file the chunk came from.

    Returns:
        ChunkNode: Node of type ``"Chunk"``.
    """
    parent_id = file_node.id
    idx_text = str(chunk_idx)
    node_id = parent_id + idx_text
    chunk_properties = [
        Property(key="content", value=chunk.page_content),
        Property(key="idx", value=idx_text),
        Property(key="sourceFileId", value=parent_id),
    ]
    return ChunkNode(id=node_id, type="Chunk", properties=chunk_properties)
|
|
|
|
def format_property_key(s: str) -> str:
    """Turn a whitespace-separated phrase into a camelCase-style key.

    The first word is lower-cased, every following word is passed through
    ``str.capitalize``, and the pieces are concatenated without separators.
    A string with no words (empty or whitespace only) is returned unchanged.

    Args:
        s: Raw property key.

    Returns:
        str: The reformatted key, or ``s`` itself when it holds no words.
    """
    pieces = s.split()
    if pieces:
        leading, *rest = pieces
        return leading.lower() + "".join(map(str.capitalize, rest))
    # Nothing to format: hand the input back untouched.
    return s
|
|
|
|
def props_to_dict(props) -> dict:
    """Collapse an iterable of key/value property objects into a dict.

    Each item's ``key`` is normalised with ``format_property_key`` and
    mapped to the item's ``value``. A falsy ``props`` (``None`` or empty)
    yields an empty dict. When two items normalise to the same key, the
    later one wins.

    Args:
        props: Iterable of objects exposing ``key`` and ``value``, or a
            falsy value.

    Returns:
        dict: Normalised key to value mapping.
    """
    if props:
        return {format_property_key(item.key): item.value for item in props}
    return {}
|
|
|
|
def map_to_base_node(node: Node) -> BaseNode:
    """Map the KnowledgeGraph Node to the base Node.

    ``node.properties`` may be a dict or a sequence of key/value property
    objects; the latter is converted with ``props_to_dict``. A ``name``
    property holding the title-cased node id is always added (overwriting
    any existing ``name``). The base node's id is the title-cased id and its
    type is the capitalised node type.

    Args:
        node: Knowledge-graph node to convert.

    Returns:
        BaseNode: Equivalent base node with a dict of properties.
    """
    if isinstance(node.properties, dict):
        # Copy so that adding "name" below never mutates the caller's dict.
        properties = dict(node.properties)
    elif node.properties:
        properties = props_to_dict(node.properties)
    else:
        properties = {}

    display_id = node.id.title()
    properties["name"] = display_id
    return BaseNode(
        id=display_id, type=node.type.capitalize(), properties=properties
    )
|
|
|
|
def map_to_base_relationship(rel: Relationship) -> BaseRelationship:
    """Convert a KnowledgeGraph Relationship into a base Relationship.

    Both endpoints are converted with ``map_to_base_node``, the relationship
    type is capitalised, and the properties are turned into a dict with
    ``props_to_dict`` (an empty dict when there are none).

    Args:
        rel: Knowledge-graph relationship to convert.

    Returns:
        BaseRelationship: Equivalent base relationship.
    """
    start_node = map_to_base_node(rel.source)
    end_node = map_to_base_node(rel.target)
    if rel.properties:
        rel_props = props_to_dict(rel.properties)
    else:
        rel_props = {}
    return BaseRelationship(
        source=start_node,
        target=end_node,
        type=rel.type.capitalize(),
        properties=rel_props,
    )
|
|
|
|
def create_relationship(source: Node, target: Node, type: str):
    """Build a property-less base relationship between two nodes.

    Both endpoints are converted with ``map_to_base_node`` and the
    relationship type is capitalised.

    Args:
        source: Node the relationship starts from.
        target: Node the relationship points to.
        type: Relationship type label.

    Returns:
        A ``BaseRelationship`` with an empty properties dict.
    """
    start_node = map_to_base_node(source)
    end_node = map_to_base_node(target)
    label = type.capitalize()
    return BaseRelationship(
        source=start_node, target=end_node, type=label, properties={}
    )
|
|
|
|
def load_and_split_documents(
    file_paths: List[str], chunk_size: int = 100, chunk_overlap: int = 20
) -> list:
    """
    Load and split multiple documents into token-based chunks.

    Files whose name ends in ``.pdf`` (case-insensitive) are read with
    ``PyPDFLoader``; everything else is read with ``TextLoader``. Each file
    is loaded once and then split exactly once with a ``TokenTextSplitter``,
    so chunk boundaries and overlaps are governed only by ``chunk_size`` and
    ``chunk_overlap``.

    Args:
        file_paths (List[str]): List of file paths to load.
        chunk_size (int): Size of each chunk (in tokens).
        chunk_overlap (int): Overlap between chunks (in tokens).

    Returns:
        List: List of split document chunks, in the order of ``file_paths``.
    """
    text_splitter = TokenTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )

    all_chunks = []
    for file_path in file_paths:
        # Compare case-insensitively so "REPORT.PDF" is not fed to TextLoader.
        if file_path.lower().endswith(".pdf"):
            loader = PyPDFLoader(file_path)
        else:
            loader = TextLoader(file_path)

        # Use load() rather than load_and_split(): the latter would first
        # split with the loader's default splitter, and the token splitter
        # would then split those pieces a second time.
        documents = loader.load()
        all_chunks.extend(text_splitter.split_documents(documents))

    return all_chunks
|
|