Module tinytroupe.extraction.artifact_exporter
Expand source code
import os
import json
import pandas as pd
import pypandoc
import markdown
from typing import Union, List
from tinytroupe.extraction import logger
from tinytroupe.utils import JsonSerializableRegistry
import tinytroupe.utils as utils
class ArtifactExporter(JsonSerializableRegistry):
"""
An artifact exporter is responsible for exporting artifacts from TinyTroupe elements, for example
in order to create synthetic data files from simulations.
"""
def __init__(self, base_output_folder:str) -> None:
self.base_output_folder = base_output_folder
def export(self, artifact_name:str, artifact_data:Union[dict, str], content_type:str, content_format:str=None, target_format:str="txt", verbose:bool=False):
"""
Exports the specified artifact data to a file.
Args:
artifact_name (str): The name of the artifact.
artifact_data (Union[dict, str]): The data to export. If a dict is given, it will be saved as JSON.
If a string is given, it will be saved as is.
content_type (str): The type of the content within the artifact.
content_format (str, optional): The format of the content within the artifact (e.g., md, csv, etc). Defaults to None.
target_format (str): The format to export the artifact to (e.g., json, txt, docx, etc).
verbose (bool, optional): Whether to print debug messages. Defaults to False.
"""
# dedent inputs, just in case
if isinstance(artifact_data, str):
artifact_data = utils.dedent(artifact_data)
elif isinstance(artifact_data, dict):
artifact_data['content'] = utils.dedent(artifact_data['content'])
else:
raise ValueError("The artifact data must be either a string or a dictionary.")
# clean the artifact name of invalid characters
invalid_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|', '\n', '\t', '\r', ';']
for char in invalid_chars:
# check if the character is in the artifact name
if char in artifact_name:
# replace the character with an underscore
artifact_name = artifact_name.replace(char, "-")
logger.warning(f"Replaced invalid character {char} with hyphen in artifact name '{artifact_name}'.")
artifact_file_path = self._compose_filepath(artifact_data, artifact_name, content_type, target_format, verbose)
if target_format == "json":
self._export_as_json(artifact_file_path, artifact_data, content_type, verbose)
elif target_format == "txt" or target_format == "text" or target_format == "md" or target_format == "markdown":
self._export_as_txt(artifact_file_path, artifact_data, content_type, verbose)
elif target_format == "docx":
self._export_as_docx(artifact_file_path, artifact_data, content_format, verbose)
else:
raise ValueError(f"Unsupported target format: {target_format}.")
def _export_as_txt(self, artifact_file_path:str, artifact_data:Union[dict, str], content_type:str, verbose:bool=False):
"""
Exports the specified artifact data to a text file.
"""
with open(artifact_file_path, 'w', encoding="utf-8") as f:
if isinstance(artifact_data, dict):
content = artifact_data['content']
else:
content = artifact_data
f.write(content)
def _export_as_json(self, artifact_file_path:str, artifact_data:Union[dict, str], content_type:str, verbose:bool=False):
"""
Exports the specified artifact data to a JSON file.
"""
with open(artifact_file_path, 'w', encoding="utf-8") as f:
if isinstance(artifact_data, dict):
json.dump(artifact_data, f, indent=4)
else:
raise ValueError("The artifact data must be a dictionary to export to JSON.")
def _export_as_docx(self, artifact_file_path:str, artifact_data:Union[dict, str], content_original_format:str, verbose:bool=False):
"""
Exports the specified artifact data to a DOCX file.
"""
# original format must be 'text' or 'markdown'
if content_original_format not in ['text', 'txt', 'markdown', 'md']:
raise ValueError(f"The original format cannot be {content_original_format} to export to DOCX.")
else:
# normalize content value
content_original_format = 'markdown' if content_original_format == 'md' else content_original_format
# first, get the content to export. If `artifact_date` is a dict, the contant should be under the key `content`.
# If it is a string, the content is the string itself.
# using pypandoc
if isinstance(artifact_data, dict):
content = artifact_data['content']
else:
content = artifact_data
# first, convert to HTML. This is necessary because pypandoc does not support a GOOD direct conversion from markdown to DOCX.
html_content = markdown.markdown(content)
## write this intermediary HTML to file
#html_file_path = artifact_file_path.replace(".docx", ".html")
#with open(html_file_path, 'w', encoding="utf-8") as f:
# f.write(html_content)
# then, convert to DOCX
pypandoc.convert_text(html_content, 'docx', format='html', outputfile=artifact_file_path)
###########################################################
# IO
###########################################################
def _compose_filepath(self, artifact_data:Union[dict, str], artifact_name:str, content_type:str, target_format:str=None, verbose:bool=False):
"""
Composes the file path for the artifact to export.
Args:
artifact_data (Union[dict, str]): The data to export.
artifact_name (str): The name of the artifact.
content_type (str): The type of the content within the artifact.
content_format (str, optional): The format of the content within the artifact (e.g., md, csv, etc). Defaults to None.
verbose (bool, optional): Whether to print debug messages. Defaults to False.
"""
# Extension definition:
#
# - If the content format is specified, we use it as the part of the extension.
# - If artificat_data is a dict, we add .json to the extension. Note that if content format was specified, we'd get <content_format>.json.
# - If artifact_data is a string and no content format is specified, we add .txt to the extension.
extension = None
if target_format is not None:
extension = f"{target_format}"
elif isinstance(artifact_data, str) and target_format is None:
extension = "txt"
# content type definition
if content_type is None:
subfolder = ""
else:
subfolder = content_type
# save to the specified file name or path, considering the base output folder.
artifact_file_path = os.path.join(self.base_output_folder, subfolder, f"{artifact_name}.{extension}")
# create intermediate directories if necessary
os.makedirs(os.path.dirname(artifact_file_path), exist_ok=True)
return artifact_file_path
Classes
class ArtifactExporter (base_output_folder: str)
-
An artifact exporter is responsible for exporting artifacts from TinyTroupe elements, for example in order to create synthetic data files from simulations.
Expand source code
class ArtifactExporter(JsonSerializableRegistry): """ An artifact exporter is responsible for exporting artifacts from TinyTroupe elements, for example in order to create synthetic data files from simulations. """ def __init__(self, base_output_folder:str) -> None: self.base_output_folder = base_output_folder def export(self, artifact_name:str, artifact_data:Union[dict, str], content_type:str, content_format:str=None, target_format:str="txt", verbose:bool=False): """ Exports the specified artifact data to a file. Args: artifact_name (str): The name of the artifact. artifact_data (Union[dict, str]): The data to export. If a dict is given, it will be saved as JSON. If a string is given, it will be saved as is. content_type (str): The type of the content within the artifact. content_format (str, optional): The format of the content within the artifact (e.g., md, csv, etc). Defaults to None. target_format (str): The format to export the artifact to (e.g., json, txt, docx, etc). verbose (bool, optional): Whether to print debug messages. Defaults to False. """ # dedent inputs, just in case if isinstance(artifact_data, str): artifact_data = utils.dedent(artifact_data) elif isinstance(artifact_data, dict): artifact_data['content'] = utils.dedent(artifact_data['content']) else: raise ValueError("The artifact data must be either a string or a dictionary.") # clean the artifact name of invalid characters invalid_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|', '\n', '\t', '\r', ';'] for char in invalid_chars: # check if the character is in the artifact name if char in artifact_name: # replace the character with an underscore artifact_name = artifact_name.replace(char, "-") logger.warning(f"Replaced invalid character {char} with hyphen in artifact name '{artifact_name}'.") artifact_file_path = self._compose_filepath(artifact_data, artifact_name, content_type, target_format, verbose) if target_format == "json": self._export_as_json(artifact_file_path, artifact_data, content_type, verbose) elif target_format == "txt" or target_format == "text" or target_format == "md" or target_format == "markdown": self._export_as_txt(artifact_file_path, artifact_data, content_type, verbose) elif target_format == "docx": self._export_as_docx(artifact_file_path, artifact_data, content_format, verbose) else: raise ValueError(f"Unsupported target format: {target_format}.") def _export_as_txt(self, artifact_file_path:str, artifact_data:Union[dict, str], content_type:str, verbose:bool=False): """ Exports the specified artifact data to a text file. """ with open(artifact_file_path, 'w', encoding="utf-8") as f: if isinstance(artifact_data, dict): content = artifact_data['content'] else: content = artifact_data f.write(content) def _export_as_json(self, artifact_file_path:str, artifact_data:Union[dict, str], content_type:str, verbose:bool=False): """ Exports the specified artifact data to a JSON file. """ with open(artifact_file_path, 'w', encoding="utf-8") as f: if isinstance(artifact_data, dict): json.dump(artifact_data, f, indent=4) else: raise ValueError("The artifact data must be a dictionary to export to JSON.") def _export_as_docx(self, artifact_file_path:str, artifact_data:Union[dict, str], content_original_format:str, verbose:bool=False): """ Exports the specified artifact data to a DOCX file. """ # original format must be 'text' or 'markdown' if content_original_format not in ['text', 'txt', 'markdown', 'md']: raise ValueError(f"The original format cannot be {content_original_format} to export to DOCX.") else: # normalize content value content_original_format = 'markdown' if content_original_format == 'md' else content_original_format # first, get the content to export. If `artifact_date` is a dict, the contant should be under the key `content`. # If it is a string, the content is the string itself. # using pypandoc if isinstance(artifact_data, dict): content = artifact_data['content'] else: content = artifact_data # first, convert to HTML. This is necessary because pypandoc does not support a GOOD direct conversion from markdown to DOCX. html_content = markdown.markdown(content) ## write this intermediary HTML to file #html_file_path = artifact_file_path.replace(".docx", ".html") #with open(html_file_path, 'w', encoding="utf-8") as f: # f.write(html_content) # then, convert to DOCX pypandoc.convert_text(html_content, 'docx', format='html', outputfile=artifact_file_path) ########################################################### # IO ########################################################### def _compose_filepath(self, artifact_data:Union[dict, str], artifact_name:str, content_type:str, target_format:str=None, verbose:bool=False): """ Composes the file path for the artifact to export. Args: artifact_data (Union[dict, str]): The data to export. artifact_name (str): The name of the artifact. content_type (str): The type of the content within the artifact. content_format (str, optional): The format of the content within the artifact (e.g., md, csv, etc). Defaults to None. verbose (bool, optional): Whether to print debug messages. Defaults to False. """ # Extension definition: # # - If the content format is specified, we use it as the part of the extension. # - If artificat_data is a dict, we add .json to the extension. Note that if content format was specified, we'd get <content_format>.json. # - If artifact_data is a string and no content format is specified, we add .txt to the extension. extension = None if target_format is not None: extension = f"{target_format}" elif isinstance(artifact_data, str) and target_format is None: extension = "txt" # content type definition if content_type is None: subfolder = "" else: subfolder = content_type # save to the specified file name or path, considering the base output folder. artifact_file_path = os.path.join(self.base_output_folder, subfolder, f"{artifact_name}.{extension}") # create intermediate directories if necessary os.makedirs(os.path.dirname(artifact_file_path), exist_ok=True) return artifact_file_path
Ancestors
Methods
def export(self, artifact_name: str, artifact_data: Union[str, dict], content_type: str, content_format: str = None, target_format: str = 'txt', verbose: bool = False)
-
Exports the specified artifact data to a file.
Args
artifact_name
:str
- The name of the artifact.
artifact_data
:Union[dict, str]
- The data to export. If a dict is given, it will be saved as JSON. If a string is given, it will be saved as is.
content_type
:str
- The type of the content within the artifact.
content_format
:str
, optional- The format of the content within the artifact (e.g., md, csv, etc). Defaults to None.
target_format
:str
- The format to export the artifact to (e.g., json, txt, docx, etc).
verbose
:bool
, optional- Whether to print debug messages. Defaults to False.
Expand source code
def export(self, artifact_name:str, artifact_data:Union[dict, str], content_type:str, content_format:str=None, target_format:str="txt", verbose:bool=False): """ Exports the specified artifact data to a file. Args: artifact_name (str): The name of the artifact. artifact_data (Union[dict, str]): The data to export. If a dict is given, it will be saved as JSON. If a string is given, it will be saved as is. content_type (str): The type of the content within the artifact. content_format (str, optional): The format of the content within the artifact (e.g., md, csv, etc). Defaults to None. target_format (str): The format to export the artifact to (e.g., json, txt, docx, etc). verbose (bool, optional): Whether to print debug messages. Defaults to False. """ # dedent inputs, just in case if isinstance(artifact_data, str): artifact_data = utils.dedent(artifact_data) elif isinstance(artifact_data, dict): artifact_data['content'] = utils.dedent(artifact_data['content']) else: raise ValueError("The artifact data must be either a string or a dictionary.") # clean the artifact name of invalid characters invalid_chars = ['/', '\\', ':', '*', '?', '"', '<', '>', '|', '\n', '\t', '\r', ';'] for char in invalid_chars: # check if the character is in the artifact name if char in artifact_name: # replace the character with an underscore artifact_name = artifact_name.replace(char, "-") logger.warning(f"Replaced invalid character {char} with hyphen in artifact name '{artifact_name}'.") artifact_file_path = self._compose_filepath(artifact_data, artifact_name, content_type, target_format, verbose) if target_format == "json": self._export_as_json(artifact_file_path, artifact_data, content_type, verbose) elif target_format == "txt" or target_format == "text" or target_format == "md" or target_format == "markdown": self._export_as_txt(artifact_file_path, artifact_data, content_type, verbose) elif target_format == "docx": self._export_as_docx(artifact_file_path, artifact_data, content_format, verbose) else: raise ValueError(f"Unsupported target format: {target_format}.")
Inherited members