auto schema generation
This commit is contained in:
parent
7a5b5bc9a1
commit
5535a2e21c
120
main.py
120
main.py
@ -10,7 +10,6 @@ import argparse
|
|||||||
parser = argparse.ArgumentParser()
|
parser = argparse.ArgumentParser()
|
||||||
parser.add_argument("--input", help="folder containing input json(s)", required=True, type=pathlib.Path)
|
parser.add_argument("--input", help="folder containing input json(s)", required=True, type=pathlib.Path)
|
||||||
parser.add_argument("--output", help="folder to place csv", required=True, type=pathlib.Path)
|
parser.add_argument("--output", help="folder to place csv", required=True, type=pathlib.Path)
|
||||||
parser.add_argument("--schema", help="schema json", required=True, type=pathlib.Path)
|
|
||||||
parser.add_argument("--single", action="store_true", help="merge all json files to single output csv")
|
parser.add_argument("--single", action="store_true", help="merge all json files to single output csv")
|
||||||
parser.add_argument("--metadata", type=int, help="how many records to parse for building metadata", default=1000)
|
parser.add_argument("--metadata", type=int, help="how many records to parse for building metadata", default=1000)
|
||||||
parser.add_argument("--join-column", help="join column from top-level to merge nested json", required=True)
|
parser.add_argument("--join-column", help="join column from top-level to merge nested json", required=True)
|
||||||
@ -18,8 +17,6 @@ parser.add_argument("--name", help="join column from top-level to merge nested j
|
|||||||
|
|
||||||
args = parser.parse_args()
|
args = parser.parse_args()
|
||||||
|
|
||||||
schema = json.load(open(args.schema))
|
|
||||||
|
|
||||||
|
|
||||||
class DBConn:
|
class DBConn:
|
||||||
def __init__(self):
|
def __init__(self):
|
||||||
@ -73,25 +70,42 @@ class DBConn:
|
|||||||
dbConn = DBConn()
|
dbConn = DBConn()
|
||||||
|
|
||||||
|
|
||||||
def extract_child(merge_headers, item, k, ext, prev_step):
|
def extract_child(merge_headers, items_list, top_level, prev_step, existing, step):
|
||||||
child_headers = merge_headers.get(k, [args.join_column])
|
headers = merge_headers.get(top_level, [args.join_column])
|
||||||
|
|
||||||
for child in item[k]:
|
flat_keys = existing.get("flat_keys", [])
|
||||||
|
extract_keys = existing.get("extract_keys", {})
|
||||||
|
|
||||||
|
for child_idx in range(len(items_list)):
|
||||||
|
child = items_list[child_idx]
|
||||||
for (subKey, subValue) in child.items():
|
for (subKey, subValue) in child.items():
|
||||||
is_ext = subKey in ext.get("extract_keys", {}).keys()
|
|
||||||
if is_ext:
|
|
||||||
extract_child(merge_headers, child, subKey, ext["extract_keys"][subKey], f"{prev_step}{k}_")
|
|
||||||
else:
|
|
||||||
child_header = f"{k}_{subKey}"
|
|
||||||
if child_header not in child_headers:
|
|
||||||
child_headers.append(child_header)
|
|
||||||
|
|
||||||
merge_headers[f"{prev_step}{k}"] = child_headers
|
is_dict = isinstance(subValue, dict)
|
||||||
|
is_list = isinstance(subValue, list)
|
||||||
|
|
||||||
|
if is_dict:
|
||||||
|
if subKey not in flat_keys:
|
||||||
|
flat_keys.append(subKey)
|
||||||
|
for (sub2Key, sub2Value) in subValue.items():
|
||||||
|
composite_key = f"{top_level}_{sub2Key}"
|
||||||
|
if composite_key not in headers:
|
||||||
|
headers.append(composite_key)
|
||||||
|
if is_list:
|
||||||
|
if subKey not in extract_keys.keys():
|
||||||
|
existing_next_step = extract_keys.get(subKey, {})
|
||||||
|
extract_keys[subKey] = extract_child(merge_headers, subValue, subKey, f"{prev_step}{top_level}_",
|
||||||
|
existing_next_step, step + 1)
|
||||||
|
else:
|
||||||
|
child_header = f"{top_level}_{subKey}"
|
||||||
|
if child_header not in headers:
|
||||||
|
headers.append(child_header)
|
||||||
|
|
||||||
|
merge_headers[f"{prev_step}{top_level}"] = headers
|
||||||
|
return {"flat_keys": flat_keys, "extract_keys": extract_keys}
|
||||||
|
|
||||||
|
|
||||||
def extract_child_value(merge_headers, item, k, ext, prev_step, join_col_value):
|
def extract_child_value(merge_headers, item, k, ext, prev_step, join_col_value):
|
||||||
child_value = {}
|
child_value = {args.join_column: join_col_value}
|
||||||
child_value[args.join_column] = join_col_value
|
|
||||||
for child in item[k]:
|
for child in item[k]:
|
||||||
for (subKey, subValue) in child.items():
|
for (subKey, subValue) in child.items():
|
||||||
is_ext = subKey in ext.get("extract_keys", {}).keys()
|
is_ext = subKey in ext.get("extract_keys", {}).keys()
|
||||||
@ -107,10 +121,9 @@ def extract_child_value(merge_headers, item, k, ext, prev_step, join_col_value):
|
|||||||
|
|
||||||
|
|
||||||
def parse_json():
|
def parse_json():
|
||||||
extract_keys = schema["extract_keys"]
|
extract_keys = {}
|
||||||
flat_keys = schema["flat_keys"]
|
flat_keys = []
|
||||||
|
|
||||||
extract_keys_names = extract_keys.keys()
|
|
||||||
headers = []
|
headers = []
|
||||||
merge_headers = {}
|
merge_headers = {}
|
||||||
|
|
||||||
@ -124,56 +137,71 @@ def parse_json():
|
|||||||
return
|
return
|
||||||
|
|
||||||
parsed = 0
|
parsed = 0
|
||||||
parent = args.name
|
top_level = args.name
|
||||||
|
|
||||||
# first pass, collect all headers
|
# first pass, collect all headers
|
||||||
for item in ijson.items(open(input_path / input_files[0]), "item"):
|
for topLevelItem in ijson.items(open(input_path / input_files[0]), "item"):
|
||||||
if parsed > args.metadata:
|
if parsed > args.metadata:
|
||||||
print(f"parsed {parsed} records for metadata")
|
print(f"parsed {parsed} records for metadata")
|
||||||
break
|
break
|
||||||
keys = item.keys()
|
|
||||||
for k in keys:
|
for (key, value) in topLevelItem.items():
|
||||||
if k in flat_keys:
|
value_is_dict = isinstance(value, dict)
|
||||||
for (fk, fv) in item[k].items():
|
value_is_list = isinstance(value, list)
|
||||||
composite_key = f"{k}_{fk}"
|
|
||||||
|
if value_is_dict:
|
||||||
|
if key not in flat_keys:
|
||||||
|
flat_keys.append(key)
|
||||||
|
|
||||||
|
for (subKey, subValue) in value.items():
|
||||||
|
composite_key = f"{key}_{subKey}"
|
||||||
if composite_key not in headers:
|
if composite_key not in headers:
|
||||||
headers.append(composite_key)
|
headers.append(composite_key)
|
||||||
elif k in extract_keys_names:
|
elif value_is_list:
|
||||||
ext = extract_keys[k]
|
existing = extract_keys.get(key, {})
|
||||||
extract_child(merge_headers, item, k, ext, f"{parent}_")
|
extract_keys[key] = extract_child(merge_headers, value, key, f"{top_level}_", existing, 1)
|
||||||
|
|
||||||
else:
|
else:
|
||||||
if k not in headers:
|
if key not in headers:
|
||||||
headers.append(k)
|
headers.append(key)
|
||||||
parsed += 1
|
parsed += 1
|
||||||
|
|
||||||
dbConn.make_table(parent, headers)
|
open("schema.json", "w").write(
|
||||||
|
json.dumps({
|
||||||
|
"flat_keys": flat_keys,
|
||||||
|
"extract_keys": extract_keys
|
||||||
|
}, indent=2)
|
||||||
|
)
|
||||||
|
dbConn.make_table(top_level, headers)
|
||||||
|
|
||||||
for (mhKey, mhVal) in merge_headers.items():
|
for (mhKey, mhVal) in merge_headers.items():
|
||||||
dbConn.make_table(mhKey, mhVal)
|
dbConn.make_table(mhKey, mhVal)
|
||||||
|
|
||||||
|
extract_keys_names = extract_keys.keys()
|
||||||
|
|
||||||
# second pass, make flat json from original-json, create csv ( we will use sqlite, as it is faster to write)
|
# second pass, make flat json from original-json, create csv ( we will use sqlite, as it is faster to write)
|
||||||
for inp_file in input_files:
|
for inp_file in input_files:
|
||||||
for item in ijson.items(open(input_path / inp_file), "item"):
|
for topLevelItem in ijson.items(open(input_path / inp_file), "item"):
|
||||||
keys = item.keys()
|
keys = topLevelItem.keys()
|
||||||
flat_json = {}
|
flat_json = {}
|
||||||
for k in keys:
|
for key in keys:
|
||||||
if k in flat_keys:
|
if key in flat_keys:
|
||||||
for (fk, fv) in item[k].items():
|
for (subKey, subValue) in topLevelItem[key].items():
|
||||||
composite_key = f"{k}_{fk}"
|
composite_key = f"{key}_{subKey}"
|
||||||
flat_json[composite_key] = fv
|
flat_json[composite_key] = subValue
|
||||||
elif k in extract_keys_names:
|
elif key in extract_keys_names:
|
||||||
ext = extract_keys[k]
|
ext = extract_keys[key]
|
||||||
extract_child_value(merge_headers, item, k, ext, f"{parent}_", item[args.join_column])
|
extract_child_value(merge_headers, topLevelItem, key, ext, f"{top_level}_",
|
||||||
|
topLevelItem[args.join_column])
|
||||||
else:
|
else:
|
||||||
flat_json[k] = item[k]
|
flat_json[key] = topLevelItem[key]
|
||||||
|
|
||||||
dbConn.write_to_database(parent, flat_json)
|
dbConn.write_to_database(top_level, flat_json)
|
||||||
|
|
||||||
if not args.single:
|
if not args.single:
|
||||||
dbConn.con.commit()
|
dbConn.con.commit()
|
||||||
dbConn.make_csv_from_tables(prefix=f"{pathlib.Path(args.output / inp_file).stem}-")
|
dbConn.make_csv_from_tables(prefix=f"{pathlib.Path(args.output / inp_file).stem}-")
|
||||||
dbConn.reinit_db()
|
dbConn.reinit_db()
|
||||||
dbConn.make_table(parent, headers)
|
dbConn.make_table(top_level, headers)
|
||||||
|
|
||||||
for (mhKey, mhVal) in merge_headers.items():
|
for (mhKey, mhVal) in merge_headers.items():
|
||||||
dbConn.make_table(mhKey, mhVal)
|
dbConn.make_table(mhKey, mhVal)
|
||||||
|
|||||||
55
schema.json
55
schema.json
@ -1,55 +0,0 @@
|
|||||||
{
|
|
||||||
"flat_keys": [
|
|
||||||
"cost_center",
|
|
||||||
"location",
|
|
||||||
"customer",
|
|
||||||
"event"
|
|
||||||
],
|
|
||||||
"extract_keys": {
|
|
||||||
"customer.addresses": {
|
|
||||||
"flat_keys": [],
|
|
||||||
"extract_keys": {}
|
|
||||||
},
|
|
||||||
"customer.phone_numbers": {
|
|
||||||
"flat_keys": [],
|
|
||||||
"extract_keys": {}
|
|
||||||
},
|
|
||||||
"price_modifiers": {
|
|
||||||
"flat_keys": [],
|
|
||||||
"extract_keys": {}
|
|
||||||
},
|
|
||||||
"tender_items": {
|
|
||||||
"flat_keys": [],
|
|
||||||
"extract_keys": {}
|
|
||||||
},
|
|
||||||
"fee_items": {
|
|
||||||
"flat_keys": [],
|
|
||||||
"extract_keys": {}
|
|
||||||
},
|
|
||||||
"tax_items": {
|
|
||||||
"flat_keys": [],
|
|
||||||
"extract_keys": {}
|
|
||||||
},
|
|
||||||
"loyalty": {
|
|
||||||
"flat_keys": [],
|
|
||||||
"extract_keys": {}
|
|
||||||
},
|
|
||||||
"sale_items": {
|
|
||||||
"flat_keys": [],
|
|
||||||
"extract_keys": {
|
|
||||||
"categories": {
|
|
||||||
"key": "categories"
|
|
||||||
},
|
|
||||||
"modifiers": {
|
|
||||||
"key": "modifiers",
|
|
||||||
"flat_keys": [],
|
|
||||||
"extract_keys": {
|
|
||||||
"categories": {
|
|
||||||
"key": "categories"
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
Loading…
x
Reference in New Issue
Block a user