# Copyright 2022 Cerebras Systems.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import ast
import glob
import io
import json
import logging
import os
import random
import warnings

import pandas as pd
from PIL import Image
def parse_arguments():
    """Parse the command-line arguments for the LLaVa dataset pre-processors.

    One subcommand exists per supported dataset (``ai2d``, ``arxivcap``,
    ``arxivqa``, ``chartqa``, ``sp_docvqa``, ``infographics_docvqa``,
    ``dvqa``, ``synthdog_en``) plus ``convert_json2jsonl``. The chosen
    subcommand is stored in ``args.dataset``; when no subcommand is given
    ``args.dataset`` is ``None`` and the caller raises.

    Returns:
        argparse.Namespace: the parsed arguments.
    """
    parser = argparse.ArgumentParser("Pre-process LLaVa datasets")
    subparsers = parser.add_subparsers(dest="dataset")
    # subparser for ai2d dataset
    preprocess_ai2d = subparsers.add_parser(
        "ai2d", help="Pre-process AI2D dataset"
    )
    preprocess_ai2d.add_argument(
        "--question_dir",
        type=str,
        required=True,
        help="Path to the AI2D question directory, which contains json files describing the question and answer corresponding to an image.",
    )
    preprocess_ai2d.add_argument(
        "--output_jsonl_dir",
        type=str,
        required=True,
        help="Folder to write the AI2D output jsonl files, which is in LLaVa format describing the image and associated question and answer.",
    )
    # subparser for arxivcap dataset
    preprocess_arxivcap = subparsers.add_parser(
        "arxivcap", help="Pre-process ArxivCAP dataset"
    )
    preprocess_arxivcap.add_argument(
        "--input_dir",
        type=str,
        required=True,
        help="Input directory of ArxivCAP dataset parquet files.",
    )
    preprocess_arxivcap.add_argument(
        "--output_jsonl_dir",
        type=str,
        required=True,
        help="Output directory of ArxivCAP processed jsonl files with LLaVa jsonl format.",
    )
    preprocess_arxivcap.add_argument(
        "--output_parquet_dir",
        type=str,
        required=True,
        help="Output directory of ArxivCAP processed parquet files.",
    )
    preprocess_arxivcap.add_argument(
        "--parquet_range",
        type=int,
        required=True,
        nargs="+",
        help="Range of ArxivCAP parquet files to be selected.",
    )
    preprocess_arxivcap.add_argument(
        "--output_image_dir",
        type=str,
        required=True,
        help="Directory of ArxivCAP image files.",
    )
    preprocess_arxivcap.add_argument(
        "--image_prefix",
        type=str,
        required=True,
        help="Relative path prefix for ArxivCAP image files.",
    )
    # subparser for arxivqa
    preprocess_arxivqa = subparsers.add_parser(
        "arxivqa", help="Pre-process ArxivQA dataset"
    )
    preprocess_arxivqa.add_argument(
        "--input_file",
        type=str,
        required=True,
        help="Path to the ArxivQA question file, which contains the question and answer corresponding to an image.",
    )
    preprocess_arxivqa.add_argument(
        "--output_jsonl_dir",
        type=str,
        required=True,
        help="Folder to write the ArxivQA output jsonl files, which is in LLaVa format describing the image and associated question and answer.",
    )
    # subparser for chartqa
    preprocess_chartqa = subparsers.add_parser(
        "chartqa", help="Pre-process ChartQA dataset"
    )
    preprocess_chartqa.add_argument(
        "--dataset_folder",
        type=str,
        required=True,
        help="Path to the ChartQA dataset folder with the data split folders.",
    )
    # subparser for sp_docvqa
    preprocess_sp_docvqa = subparsers.add_parser(
        "sp_docvqa", help="Pre-process SP-DocVQA dataset"
    )
    preprocess_sp_docvqa.add_argument(
        "--dataset_folder",
        type=str,
        required=True,
        help="Path to the SP-DocVQA dataset folder with the data files.",
    )
    # subparser for infographics_docvqa
    preprocess_infographics_docvqa = subparsers.add_parser(
        "infographics_docvqa", help="Pre-process Infographics-DocVQA dataset"
    )
    preprocess_infographics_docvqa.add_argument(
        "--dataset_folder",
        type=str,
        required=True,
        # Typo fix: "Inforgraphics" -> "Infographics"
        help="Path to the Infographics-DocVQA dataset folder with the data files.",
    )
    # subparser for dvqa
    preprocess_dvqa = subparsers.add_parser(
        "dvqa", help="Pre-process DVQA dataset"
    )
    preprocess_dvqa.add_argument(
        "--dataset_folder",
        type=str,
        required=True,
        help="Path to the DVQA dataset folder with the data files.",
    )
    # subparser for synthdog_en
    preprocess_synthdog_en = subparsers.add_parser(
        "synthdog_en", help="Pre-process Synthdog_EN dataset"
    )
    preprocess_synthdog_en.add_argument(
        "--input_dir",
        type=str,
        required=True,
        help="Input directory of Synthdog-EN dataset parquet files.",
    )
    preprocess_synthdog_en.add_argument(
        "--output_jsonl_dir",
        type=str,
        required=True,
        help="Output directory of Synthdog-EN processed json files with LLaVa jsonl format.",
    )
    preprocess_synthdog_en.add_argument(
        "--output_parquet_dir",
        type=str,
        required=True,
        help="Output directory of Synthdog-EN processed parquet files.",
    )
    preprocess_synthdog_en.add_argument(
        "--parquet_range",
        type=int,
        required=True,
        nargs="+",
        help="Range of Synthdog-EN parquet files to be selected.",
    )
    preprocess_synthdog_en.add_argument(
        "--output_image_dir",
        type=str,
        required=True,
        help="Directory of Synthdog-EN image files.",
    )
    preprocess_synthdog_en.add_argument(
        "--image_prefix",
        type=str,
        required=True,
        help="Relative path prefix for Synthdog-EN image files.",
    )
    # subparser for simply converting from json to jsonl
    preprocess_json_to_jsonl = subparsers.add_parser(
        "convert_json2jsonl", help="Pre-process json files to jsonl files"
    )
    preprocess_json_to_jsonl.add_argument(
        "--input_dir",
        type=str,
        required=True,
        help="Path to a folder of json files that need to be converted to jsonl format.",
    )
    # return parsed arguments
    args = parser.parse_args()
    return args
def convert_json_to_jsonl(input_folder):
    """Convert every ``*.json`` file in *input_folder* to JSON Lines format.

    Each input file must contain a top-level JSON array; every element is
    written as one line of the output file. Results go to a sibling folder
    named ``<input_folder>_to_jsonl``, one ``.jsonl`` file per input file.

    Args:
        input_folder: path to a folder containing ``*.json`` files.

    Note: the original signature named this parameter ``new_data`` while the
    body read an undefined ``input_folder`` (NameError on every call); the
    parameter is now named for what it actually is. The call site passes it
    positionally, so callers are unaffected.
    """
    output_folder = f"{input_folder}_to_jsonl"
    if not os.path.exists(output_folder):
        os.makedirs(output_folder)
    json_files = list(glob.glob(os.path.join(input_folder, "*.json")))
    for jfile in json_files:
        # Replace only the extension; a plain str.replace("json", "jsonl")
        # would also corrupt filenames whose stem contains "json".
        stem, _ = os.path.splitext(os.path.basename(jfile))
        out_file = os.path.join(output_folder, stem + ".jsonl")
        logging.info(f"Processing {jfile} -> {out_file}")
        with open(jfile, "r") as fh:
            data = json.load(fh)
        # Convert and save to JSONL
        with open(out_file, "w") as jsonl_file:
            for entry in data:
                jsonl_file.write(json.dumps(entry) + "\n")
    logging.info(f"--- jsonl files saved at {output_folder} ---")
def process_ai2d(args):
    """Convert the AI2D question files into a single LLaVa-format jsonl file.

    Reads every file in ``args.question_dir`` (one JSON document per image,
    each holding several questions), formats the answer options as
    "a) ... b) ...", and writes ``ai2d_llava.jsonl`` under
    ``args.output_jsonl_dir``.
    """
    question_dir = args.question_dir
    output_jsonl_dir = args.output_jsonl_dir
    if not os.path.exists(output_jsonl_dir):
        os.makedirs(output_jsonl_dir, exist_ok=False)

    # AI2D questions have at most four answer options.
    option_labels = ["a", "b", "c", "d"]

    samples = []
    for fname in os.listdir(question_dir):
        with open(os.path.join(question_dir, fname), "r") as fh:
            doc = json.load(fh)
        image_name = f"ai2d/images/{doc['imageName']}"
        for question, qinfo in doc["questions"].items():
            # Prefix each option with its letter label, e.g. "a) option".
            labeled = [
                f"{option_labels[i]}) {text}"
                for i, text in enumerate(qinfo["answerTexts"])
            ]
            prompt = "<image>\n" + question + " " + " ".join(labeled)
            samples.append(
                {
                    "id": qinfo["questionId"],
                    "image": image_name,
                    "conversations": [
                        {"from": "human", "value": prompt},
                        # The answer is the full labeled option text.
                        {"from": "gpt", "value": labeled[qinfo["correctAnswer"]]},
                    ],
                }
            )
    out_file = os.path.join(output_jsonl_dir, "ai2d_llava.jsonl")
    with open(out_file, "w") as jsonl_file:
        for entry in samples:
            jsonl_file.write(json.dumps(entry) + "\n")
    logging.info(f"--- jsonl files saved at {output_jsonl_dir} ---")
def process_arxivcap(args):
    """Pre-process ArxivCAP parquet shards into LLaVa-format jsonl files.

    Selects the parquet shards whose month index (third ``_``-separated
    token of the filename) falls in ``args.parquet_range`` ([start, end)),
    converts the single-figure captions of each row into LLaVa
    conversations, extracts the embedded images to ``args.output_image_dir``,
    and writes one processed parquet file and one jsonl file per shard.
    """
    # Only handling single-figure captioning for now.
    question_dict = {
        "Single-Figure Captioning": "Create a caption for the provided figure.",
        "Multiple-Figure Captioning": "Create a caption for the provided figures.",
        "Title Generation": "According to the figures and captions, generate a title for this paper. Title:",
        "Contextualized Captioning": None,  # depends on the figure type
    }

    def preprocess_parquet_to_llava(
        in_filename,
        out_jsonl_filename,
        out_parquet_fname,
        image_foldername,
        img_relpath_prefix,
    ):
        """Convert one parquet shard: write jsonl, parquet, and image files."""
        logging.info(f"preprocessing: {in_filename}")
        data = pd.read_parquet(in_filename)
        # Zero-padded row index becomes the sample id.
        data["id"] = data.index
        data["id"] = data["id"].apply(lambda x: "{:07d}".format(x))

        def convert_to_llava(caption_images):
            """Build LLaVa samples for the single-figure entries of one row.

            Multi-figure entries are only tallied, not converted; the images
            of every entry are still extracted to disk.
            """
            img_with_subcaption, img_single = 0, 0
            llava_samples = []
            for caption_img in caption_images:
                image_path = caption_img["cil_pairs"][0]["image_file"]
                if len(caption_img["cil_pairs"]) == 1:
                    # Single figure: emit one human/gpt conversation pair.
                    image_filename = os.path.join(
                        img_relpath_prefix, image_path
                    )
                    caption = caption_img["caption"]
                    out = {
                        # id is the image filename without its ".jpg" suffix.
                        "id": image_path.split("/")[1][: -len(".jpg")],
                        "image": image_filename,
                        "conversations": [
                            {"from": "human", "value": None},
                            {"from": "gpt", "value": None},
                        ],
                    }
                    conversations = out["conversations"]
                    question = question_dict["Single-Figure Captioning"]
                    conversations[0]["value"] = f"<image>\n{question}"
                    conversations[1]["value"] = caption
                    llava_samples.append(out)
                    img_single += 1
                else:
                    # Multi-figure entry: counted but not converted.
                    for subcaption in caption_img["cil_pairs"]:
                        img_with_subcaption += 1
                # Extract every image of this entry under its paper subfolder.
                if not os.path.exists(
                    os.path.join(image_foldername, image_path.split("/")[0])
                ):
                    os.makedirs(
                        os.path.join(image_foldername, image_path.split("/")[0])
                    )
                for img in caption_img["cil_pairs"]:
                    image_name = os.path.join(
                        image_foldername, img["image_file"]
                    )
                    image = Image.open(io.BytesIO(img["image"]["bytes"]))
                    image.save(image_name)
            return llava_samples

        data["llava"] = data.apply(
            lambda x: convert_to_llava(x.caption_images), axis=1
        )
        logging.info("Writing preprocessed parquet")
        data.to_parquet(out_parquet_fname, compression=None)
        with open(out_jsonl_filename, "w") as jsonl_file:
            for entry in data["llava"].tolist():
                jsonl_file.write(json.dumps(entry) + "\n")

    input_dir = args.input_dir
    parquet_range = args.parquet_range
    all_files = sorted(
        glob.glob(os.path.join(input_dir, "*.parquet")),
        key=lambda x: int(os.path.basename(x).split("_")[2]),
    )

    def file_filter(path, file_range):
        """Keep files whose month index falls in [start, end)."""
        f_month = int(os.path.basename(path).split("_")[2])
        # Bug fix: the original fell through without a `return` (yielding
        # None) on the out-of-range branch; make the boolean explicit.
        return file_range[0] <= f_month < file_range[1]

    select_files = [f for f in all_files if file_filter(f, parquet_range)]
    # Bug fix: logging.info takes a %-style format string; the original
    # passed the count as a stray argument, which raised a logging error.
    logging.info("selected_files: %d", len(select_files))
    os.makedirs(args.output_jsonl_dir, exist_ok=True)
    os.makedirs(args.output_parquet_dir, exist_ok=True)
    for file in select_files:
        logging.info(f"---------- Parsing file: {file} ----------")
        output_jsonl_fname = os.path.basename(file).replace(
            ".parquet", ".jsonl"
        )
        out_jsonl_filename = os.path.join(
            args.output_jsonl_dir, output_jsonl_fname
        )
        out_parquet_fname = os.path.join(
            args.output_parquet_dir,
            output_jsonl_fname.replace("jsonl", "parquet"),
        )
        logging.info(f"in_filename: {file}")
        logging.info(f"out_jsonl_filename: {out_jsonl_filename}")
        logging.info(f"out_parquet_filename: {out_parquet_fname}")
        logging.info(f"image_foldername: {args.output_image_dir}")
        logging.info(f"img_relpath_prefix: {args.image_prefix}")
        preprocess_parquet_to_llava(
            in_filename=file,
            out_jsonl_filename=out_jsonl_filename,
            out_parquet_fname=out_parquet_fname,
            image_foldername=args.output_image_dir,
            img_relpath_prefix=args.image_prefix,
        )
    logging.info(f"--- jsonl files saved at {args.output_jsonl_dir} ---")
def process_arxivqa(args):
    """Convert the ArxivQA question file into a LLaVa-format jsonl file.

    Reads ``args.input_file`` (jsonl, one question per line) and writes
    ``arxivqa_llava.jsonl`` under ``args.output_jsonl_dir``. Samples whose
    answer cannot be resolved are dropped.
    """
    input_file = args.input_file
    output_jsonl_dir = args.output_jsonl_dir
    if not os.path.exists(output_jsonl_dir):
        os.makedirs(output_jsonl_dir, exist_ok=False)
    # Load your JSONL file
    with open(input_file, "r") as jsonl_file:
        json_list = list(jsonl_file)

    def get_user_string(question, options):
        # Prompt is the question followed by all options, space-separated.
        option_str = " ".join(options)
        return "<image>\n" + question + " " + option_str

    # Map option letters to their index in the options list.
    label_dict = {
        "A": 0,
        "B": 1,
        "C": 2,
        "D": 3,
        "E": 4,
        "F": 5,
        "G": 6,
        "H": 7,
        "I": 8,
    }

    # Note: some options don't have A/B/C/D and some options have format
    # "A)" or "A.". Labels may or may not contain the full option string and
    # are not consistent. As a cleanup step, we replace the label with the
    # full text of the option, regardless of the format for the options.
    def get_gpt_string(options, label, rationale):
        """Build the assistant answer string, or None to drop the sample."""
        if label == "":
            # Bug fix: the original only handled label == "" when the
            # rationale was also empty and crashed (IndexError on label[0])
            # otherwise; treat any empty label as "no response".
            return None
        # Label of type "[xxxxxx]": keep it verbatim.
        if label[0] not in label_dict:
            return label + " " + rationale
        # Label of type "A". Apparently there are labels that are beyond
        # the options list; drop those samples.
        if label_dict[label[0]] >= len(options):
            return None
        label_str = options[label_dict[label[0]]]
        return label_str + " " + rationale

    new_data = []
    for d in json_list:
        d = json.loads(d)
        new_d = {
            "id": d["id"],
            "image": f"ArxivQA/{d['image']}",
            "conversations": [
                {
                    "from": "human",
                    "value": get_user_string(d["question"], d["options"]),
                },
                {
                    "from": "gpt",
                    "value": get_gpt_string(
                        d["options"], d["label"], d["rationale"]
                    ),
                },
            ],
        }
        # Skip samples with an unresolvable answer.
        if new_d["conversations"][1]["value"] is not None:
            new_data.append(new_d)
    out_file = os.path.join(output_jsonl_dir, "arxivqa_llava.jsonl")
    with open(out_file, "w") as jsonl_file:
        for entry in new_data:
            jsonl_file.write(json.dumps(entry) + "\n")
    logging.info(f"--- jsonl files saved at {output_jsonl_dir} ---")
def process_chartqa(args):
    """Convert ChartQA question files into LLaVa-format jsonl files.

    For every split (train/val/test) and subset (human/augmented) the
    file ``<split>/<split>_<subset>.json`` under ``args.dataset_folder``
    is rewritten as ``<split>/<split>_<subset>_llava_jsonl/<split>_<subset>_llava.jsonl``.
    """

    def generate(split, subset):
        input_file = f"{args.dataset_folder}/{split}/{split}_{subset}.json"
        output_file = f"{args.dataset_folder}/{split}/{split}_{subset}_llava_jsonl/{split}_{subset}_llava.jsonl"
        output_jsonl_dir = os.path.dirname(output_file)
        if not os.path.exists(output_jsonl_dir):
            os.makedirs(output_jsonl_dir, exist_ok=False)
        # Load the raw question/answer records.
        with open(input_file, "r") as fh:
            raw_records = json.load(fh)
        converted = [
            {
                "id": idx,
                "image": f"ChartQA_Dataset/{split}/png/{rec['imgname']}",
                "conversations": [
                    {"from": "human", "value": f"<image>\n{rec['query']}"},
                    {"from": "gpt", "value": rec["label"]},
                ],
            }
            for idx, rec in enumerate(raw_records)
        ]
        with open(output_file, "w") as jsonl_file:
            jsonl_file.writelines(
                json.dumps(entry) + "\n" for entry in converted
            )
        logging.info(f"--- jsonl files saved at {output_jsonl_dir} ---")

    for split in ["train", "val", "test"]:
        for subset in ["human", "augmented"]:
            generate(split, subset)
def process_sp_docvqa(args):
    """Convert SP-DocVQA question files into LLaVa-format jsonl files.

    For each supported split, ``<folder>/<split>.json`` is rewritten as
    ``<folder>/<split>_llava_jsonl/<split>_llava.jsonl``.
    """

    def generate(split):
        input_file = f"{args.dataset_folder}/{split}.json"
        output_file = (
            f"{args.dataset_folder}/{split}_llava_jsonl/{split}_llava.jsonl"
        )
        output_jsonl_dir = os.path.dirname(output_file)
        if not os.path.exists(output_jsonl_dir):
            os.makedirs(output_jsonl_dir, exist_ok=False)
        with open(input_file, "r") as fh:
            questions = json.load(fh)["data"]
        records = []
        for quest in questions:
            # Keep only the basename of the referenced image.
            image_name = quest["image"].split("/")[-1]
            records.append(
                {
                    "id": quest["questionId"],
                    "image": f"DocVQA/sp_docvqa/images/{image_name}",
                    "conversations": [
                        {"from": "human", "value": f"<image>\n{quest['question']}"},
                        # only use the first answer
                        {"from": "gpt", "value": quest["answers"][0]},
                    ],
                }
            )
        with open(output_file, "w") as jsonl_file:
            jsonl_file.writelines(json.dumps(r) + "\n" for r in records)
        logging.info(f"--- jsonl files saved at {output_jsonl_dir} ---")

    for split in ("train_v1.0_withQT", "val_v1.0_withQT"):
        generate(split)
def process_infographics_docvqa(args):
    """Convert Infographics-DocVQA question files into LLaVa-format jsonl files.

    For each supported split, ``<folder>/infographicsVQA_<split>.json`` is
    rewritten as
    ``<folder>/infographicsVQA_<split>_llava_jsonl/infographicsVQA_<split>_llava.jsonl``.
    """

    def generate(split):
        input_file = f"{args.dataset_folder}/infographicsVQA_{split}.json"
        output_file = f"{args.dataset_folder}/infographicsVQA_{split}_llava_jsonl/infographicsVQA_{split}_llava.jsonl"
        output_jsonl_dir = os.path.dirname(output_file)
        if not os.path.exists(output_jsonl_dir):
            os.makedirs(output_jsonl_dir, exist_ok=False)
        with open(input_file, "r") as fh:
            questions = json.load(fh)["data"]
        records = [
            {
                "id": q["questionId"],
                "image": f"DocVQA/Infographicsvqa/images/{q['image_local_name']}",
                "conversations": [
                    {"from": "human", "value": f"<image>\n{q['question']}"},
                    # only use the first answer
                    {"from": "gpt", "value": q["answers"][0]},
                ],
            }
            for q in questions
        ]
        with open(output_file, "w") as jsonl_file:
            jsonl_file.writelines(json.dumps(r) + "\n" for r in records)
        logging.info(f"--- jsonl files saved at {output_jsonl_dir} ---")

    for split in ("train_v1.0", "val_v1.0_withQT"):
        generate(split)
def process_dvqa(args):
    """Convert the DVQA train question file into a LLaVa-format jsonl file.

    Reads ``<folder>/train_qa.json`` and writes
    ``<folder>/train_qa_llava_jsonl/train_qa_llava.jsonl``.
    """
    # Other available subsets: "val_easy_qa", "val_hard_qa".
    subset = "train_qa"
    input_file = f"{args.dataset_folder}/{subset}.json"
    output_file = (
        f"{args.dataset_folder}/{subset}_llava_jsonl/{subset}_llava.jsonl"
    )
    output_jsonl_dir = os.path.dirname(output_file)
    if not os.path.exists(output_jsonl_dir):
        os.makedirs(output_jsonl_dir, exist_ok=False)
    # Load the raw question/answer records.
    with open(input_file, "r") as fh:
        questions = json.load(fh)
    records = [
        {
            "id": q["question_id"],
            "image": f"DVQA/images/{q['image']}",
            "conversations": [
                {"from": "human", "value": f"<image>\n{q['question']}"},
                {"from": "gpt", "value": q["answer"]},
            ],
        }
        for q in questions
    ]
    with open(output_file, "w") as jsonl_file:
        jsonl_file.writelines(json.dumps(r) + "\n" for r in records)
    logging.info(f"--- jsonl files saved at {output_jsonl_dir} ---")
def process_synthdog_en(args):
    """Pre-process Synthdog-EN parquet shards into LLaVa-format jsonl files.

    Selects the parquet shards whose shard number (second ``-``-separated
    token of the filename) falls in ``args.parquet_range`` ([start, end)),
    converts each row's OCR ground truth into a LLaVa conversation with a
    randomly chosen prompt, extracts the embedded images, and writes one
    processed parquet file and one json file per shard.

    Raises:
        ValueError: if ``args.image_prefix`` is not part of the computed
            image output subdirectory.
    """
    # Prompt templates; one is picked at random per sample.
    question_list = [
        "Describe the image concisely.",
        "Provide a brief description of the given image.",
        "Offer a succinct explanation of the picture presented.",
        "Summarize the visual content of the image.",
        "Give a short and clear explanation of the subsequent image.",
        "Share a concise interpretation of the image provided.",
        "Present a compact description of the photo's key features.",
        "Relay a brief, clear account of the picture shown.",
        "Render a clear and concise summary of the photo.",
        "Write a terse but informative summary of the picture.",
        "Create a compact narrative representing the image presented.",
    ]

    def preprocess_parquet_to_llava(
        in_filename,
        out_jsonl_filename,
        out_parquet_fname,
        image_foldername,
        img_relpath_prefix,
    ):
        """Convert one parquet shard: write jsonl, parquet, and image files."""
        logging.info(f"preprocessing: {in_filename}")
        data = pd.read_parquet(in_filename)
        # Zero-padded row index becomes the sample id.
        data["id"] = data.index
        data["id"] = data["id"].apply(lambda x: "{:07d}".format(x))

        def convert_to_llava(id, ground_truth):
            """Build one LLaVa sample from a row id and its ground truth."""
            out = {
                "id": id,
                "image": os.path.join(img_relpath_prefix, f"{id}.png"),
                "conversations": [
                    {"from": "human", "value": None},
                    {"from": "gpt", "value": None},
                ],
            }
            # ground_truth is a Python-literal dict stored as a string.
            # Security fix: parse it with ast.literal_eval instead of the
            # original eval(), which would execute arbitrary expressions.
            ground_truth = ast.literal_eval(ground_truth)
            conversations = out["conversations"]
            question_idx = random.randint(0, len(question_list) - 1)
            question = question_list[question_idx]
            conversations[0]["value"] = f"<image>\n{question}"
            conversations[1]["value"] = ground_truth["gt_parse"][
                "text_sequence"
            ]
            return out

        def save_image(id, image):
            """Decode the raw image bytes and write them as <id>.png."""
            image = Image.open(io.BytesIO(image["bytes"]))
            p = os.path.join(image_foldername, f"{id}.png")
            image.save(p)

        data["llava"] = data.apply(
            lambda x: convert_to_llava(x.id, x.ground_truth), axis=1
        )
        logging.info("Writing preprocessed parquet")
        data.to_parquet(out_parquet_fname, compression=None)
        with open(out_jsonl_filename, "w") as jsonl_file:
            for entry in data["llava"].tolist():
                jsonl_file.write(json.dumps(entry) + "\n")
        logging.info("Saving images now")
        data.apply(lambda x: save_image(x.id, x.image), axis=1)
        logging.info("DONE: saving images")

    input_dir = args.input_dir
    parquet_range = args.parquet_range
    all_files = sorted(
        glob.glob(os.path.join(input_dir, "*.parquet")),
        key=lambda x: int(os.path.basename(x).split("-")[1]),
    )

    def file_filter(path, file_range):
        """Keep files whose shard number falls in [start, end)."""
        fnum = int(os.path.basename(path).split("-")[1])
        # Bug fix: the original fell through without a `return` (yielding
        # None) on the out-of-range branch; make the boolean explicit.
        return file_range[0] <= fnum < file_range[1]

    select_files = [f for f in all_files if file_filter(f, parquet_range)]
    logging.info(f"selected_files: {select_files}")
    os.makedirs(args.output_jsonl_dir, exist_ok=True)
    os.makedirs(args.output_parquet_dir, exist_ok=True)
    for file in select_files:
        logging.info(f"---------- Parsing file: {file} ----------")
        output_jsonl_fname = os.path.basename(file).replace(".parquet", ".json")
        out_jsonl_filename = os.path.join(
            args.output_jsonl_dir, output_jsonl_fname
        )
        out_parquet_fname = os.path.join(
            args.output_parquet_dir,
            output_jsonl_fname.replace("json", "parquet"),
        )
        # Images for shard "<split>-<num>-..." go under <out>/<split>/<num>/.
        splits = os.path.basename(file).split("-")
        image_subdir = os.path.join(args.output_image_dir, splits[0], splits[1])
        if not os.path.exists(image_subdir):
            os.makedirs(image_subdir)
        # Robustness fix: the original used a bare assert here, which is
        # stripped under `python -O`; raise explicitly instead.
        if args.image_prefix not in image_subdir:
            raise ValueError(
                f"image_prefix {args.image_prefix!r} is not part of the "
                f"image output directory {image_subdir!r}"
            )
        image_prefix = os.path.join(args.image_prefix, splits[0], splits[1])
        logging.info(f"in_filename: {file}")
        logging.info(f"out_jsonl_filename: {out_jsonl_filename}")
        logging.info(f"out_parquet_filename: {out_parquet_fname}")
        logging.info(f"image_foldername: {image_subdir}")
        logging.info(f"img_relpath_prefix: {image_prefix}")
        preprocess_parquet_to_llava(
            in_filename=file,
            out_jsonl_filename=out_jsonl_filename,
            out_parquet_fname=out_parquet_fname,
            image_foldername=image_subdir,
            img_relpath_prefix=image_prefix,
        )
    logging.info(f"--- jsonl files saved at {args.output_jsonl_dir} ---")
if __name__ == "__main__":
    args = parse_arguments()
    # Map each dataset subcommand to its pre-processing entry point.
    handlers = {
        "ai2d": process_ai2d,
        "arxivcap": process_arxivcap,
        "arxivqa": process_arxivqa,
        "chartqa": process_chartqa,
        "sp_docvqa": process_sp_docvqa,
        "infographics_docvqa": process_infographics_docvqa,
        "dvqa": process_dvqa,
        "synthdog_en": process_synthdog_en,
    }
    if args.dataset in handlers:
        handlers[args.dataset](args)
    elif args.dataset == "convert_json2jsonl":
        # This subcommand takes the input folder directly, not the namespace.
        convert_json_to_jsonl(args.input_dir)
    else:
        raise ValueError(
            "Dataset currently not supported. Feel free to adapt codebase to include your dataset."
        )