"""Utils for data load, save, and process (e.g., prompt construction)""" import json import os import re import yaml DOMAIN_CAT2SUB_CAT = { "Art and Design": ["Art", "Art_Theory", "Design", "Music"], "Business": ["Accounting", "Economics", "Finance", "Manage", "Marketing"], "Science": [ "Biology", "Chemistry", "Geography", "Math", "Physics", ], "Health and Medicine": [ "Basic_Medical_Science", "Clinical_Medicine", "Diagnostics_and_Laboratory_Medicine", "Pharmacy", "Public_Health", ], "Humanities and Social Science": [ "History", "Literature", "Sociology", "Psychology", ], "Tech and Engineering": [ "Agriculture", "Architecture_and_Engineering", "Computer_Science", "Electronics", "Energy_and_Power", "Materials", "Mechanical_Engineering", ], } CAT_SHORT2LONG = { "acc": "Accounting", "agri": "Agriculture", "arch": "Architecture_and_Engineering", "art": "Art", "art_theory": "Art_Theory", "bas_med": "Basic_Medical_Science", "bio": "Biology", "chem": "Chemistry", "cli_med": "Clinical_Medicine", "cs": "Computer_Science", "design": "Design", "diag_med": "Diagnostics_and_Laboratory_Medicine", "econ": "Economics", "elec": "Electronics", "ep": "Energy_and_Power", "fin": "Finance", "geo": "Geography", "his": "History", "liter": "Literature", "manage": "Manage", "mark": "Marketing", "mate": "Materials", "math": "Math", "mech": "Mechanical_Engineering", "music": "Music", "phar": "Pharmacy", "phys": "Physics", "psy": "Psychology", "pub_health": "Public_Health", "socio": "Sociology", } # DATA SAVING def save_json(filename, ds): with open(filename, "w") as f: json.dump(ds, f, indent=4) def get_multi_choice_info(options): """ Given the list of options for multiple choice question Return the index2ans and all_choices """ start_chr = "A" all_choices = [] index2ans = {} for i, option in enumerate(options): index2ans[chr(ord(start_chr) + i)] = option all_choices.append(chr(ord(start_chr) + i)) return index2ans, all_choices def load_yaml(file_path): with open(file_path, "r") as stream: try: yaml_dict = yaml.safe_load(stream) except yaml.YAMLError as exc: print(exc) return yaml_dict def parse_img_path(text): matches = re.findall("", text) return matches def process_single_sample(data): question = data["question"] o_imgs_paths = [] for option in data["options"]: current_o_imgs_paths = parse_img_path(option) for img_path in current_o_imgs_paths: o_imgs_paths.append(img_path) if len(o_imgs_paths) > 1: # multiple images in options, used for random selection return { "id": data["id"], "question": question, "options": data["options"], "answer": data["answer"], "image": None, "question_type": data["question_type"], } else: return { "id": data["id"], "question": question, "options": data["options"], "answer": data["answer"], "image": data["image_1"], "question_type": data["question_type"], } # DATA SAVING def save_json(filename, ds): print(f"answers saved to: {filename}") os.makedirs(os.path.dirname(filename), exist_ok=True) with open(filename, "w") as f: json.dump(ds, f, indent=4) def save_jsonl(filename, data): """ Save a dictionary of data to a JSON Lines file with the filename as key and caption as value. Args: filename (str): The path to the file where the data should be saved. data (dict): The dictionary containing the data to save where key is the image path and value is the caption. """ with open(filename, "w", encoding="utf-8") as f: for img_path, caption in data.items(): # Extract the base filename without the extension base_filename = os.path.basename(img_path) # Create a JSON object with the filename as the key and caption as the value json_record = json.dumps({base_filename: caption}, ensure_ascii=False) # Write the JSON object to the file, one per line f.write(json_record + "\n") def save_args(args, path_dir): argsDict = args.__dict__ with open(path_dir + "setting.txt", "w") as f: f.writelines("------------------ start ------------------" + "\n") for eachArg, value in argsDict.items(): f.writelines(eachArg + " : " + str(value) + "\n") f.writelines("------------------- end -------------------") # DATA PROCESSING def construct_prompt(sample, config): question = sample["question"] options = eval(sample["options"]) example = "" if sample["question_type"] == "multiple-choice": start_chr = "A" prediction_range = [] index2ans = {} for option in options: prediction_range.append(start_chr) example += f"({start_chr}) {option}\n" index2ans[start_chr] = option start_chr = chr(ord(start_chr) + 1) empty_prompt_sample_structure = config["multi_choice_example_format"] empty_prompt = empty_prompt_sample_structure.format(question, example) res_dict = {} res_dict["index2ans"] = index2ans res_dict["correct_choice"] = sample["answer"] res_dict["all_choices"] = prediction_range res_dict["empty_prompt"] = empty_prompt if config["task_instructions"]: res_dict["final_input_prompt"] = ( config["task_instructions"].strip() + "\n\n" + empty_prompt ) else: res_dict["final_input_prompt"] = empty_prompt res_dict["gt_content"] = options[ord(sample["answer"].upper()) - ord("A")] else: empty_prompt_sample_structure = config["short_ans_example_format"] empty_prompt = empty_prompt_sample_structure.format(question) res_dict = {} res_dict["empty_prompt"] = empty_prompt if config["task_instructions"]: res_dict["final_input_prompt"] = ( config["task_instructions"].strip() + "\n\n" + empty_prompt ) else: res_dict["final_input_prompt"] = empty_prompt res_dict["gt_content"] = sample["answer"] res_dict.update(sample) return res_dict