From 72c13c353fab12b14918ab90b50b526f62dd9ff2 Mon Sep 17 00:00:00 2001 From: "gowthaman.b" Date: Wed, 21 Jun 2023 17:41:56 +0530 Subject: [PATCH] add a matching seq no --- main.py | 64 +++++++++++++++++++++++++++++++++++++++++---------------- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/main.py b/main.py index 4e84b36..8b69ece 100644 --- a/main.py +++ b/main.py @@ -1,4 +1,5 @@ import json +import os import pathlib import time import shutil @@ -13,8 +14,9 @@ parser.add_argument("--input", help="folder containing input json(s)", required= parser.add_argument("--output", help="folder to place csv", required=True, type=pathlib.Path) parser.add_argument("--delimiter", help="delimiter for CSV (default is '|'", default="|") parser.add_argument("--single", action="store_true", help="merge all json files to single output csv") -parser.add_argument("--debug", action="store_true", help="print debug logs") +parser.add_argument("--verbose", '-v', action="count", help="set verbose level", default=0) parser.add_argument("--zip", action="store_true", help="make a zipfile of all outputs") +parser.add_argument("--clean", action="store_true", help="clear output directory") parser.add_argument("--metadata", type=int, help="how many records to parse for building metadata", default=1000) parser.add_argument("--join-column", help="join column from top-level to merge nested json", required=True) parser.add_argument("--name", help="base name, to be used in creating all data", required=True) @@ -29,6 +31,7 @@ def make_archive(source: pathlib.Path, destination: pathlib.Path) -> None: base_dir = source.name shutil.make_archive(str(base_name), fmt, root_dir, base_dir) + class DBConn: def __init__(self): self.cur = None @@ -49,7 +52,7 @@ class DBConn: syspk = "syspk integer primary key autoincrement" other_cols = ', '.join([f"\"{f}\" TEXT" for f in cols]) create_tbl_sql = f"create table if not exists \"{tbl_name}\" ({syspk}, {other_cols})" - if args.debug: + if args.verbose >= 1: print(f"create sql = {create_tbl_sql}") self.cur.execute(create_tbl_sql) @@ -75,14 +78,18 @@ class DBConn: not_found.append(col) if len(not_found) > 0: + if args.verbose >= 1: + print( + f"added new cols {', '.join(not_found)} to {tbl}, already present {tbl_cols}, want {col_names}") + + new_cols = list(tbl_cols) for new_col in not_found: self.cur.execute(f"alter table \"{tbl}\" add column \"{new_col}\" text") - - print(f"added new cols {', '.join(not_found)} to {tbl}") - self.table_mapping[tbl] = list(col_names) + new_cols.append(new_col) + self.table_mapping[tbl] = new_cols sql = f"insert into \"{tbl}\" ({col_names_placeholder}) values({value_placeholders})" - if args.debug: + if args.verbose == 2: print(f"insert sql = {sql}") self.cur.execute(sql, values) @@ -132,23 +139,31 @@ class DBConn: self.table_mapping[f"{prev_step}{current_level}"] = attributes return {"flat_attributes": flat_attributes, "children": children, "attributes": attributes} - def extract_child_value(self, item, k, ext, prev_step, join_col_value): - child_value = {args.join_column: join_col_value} - for child in item[k]: + def extract_child_value(self, item, current_level, ext, prev_step, prev_step_seq, join_col_value): + k_ = f"{prev_step}{current_level}" + + for seqNo, child in enumerate(item[current_level]): + child_value = { + args.join_column: join_col_value, + f"{prev_step}seq": prev_step_seq, + f"{current_level}_seq": seqNo + 1 + } for (subKey, subValue) in child.items(): is_ext = subKey in ext.get("children", {}).keys() if is_ext: self.extract_child_value(child, subKey, ext["children"][subKey], - f"{prev_step}{k}_", - join_col_value) + f"{prev_step}{current_level}_", + seqNo + 1, join_col_value) else: - child_header = f"{k}_{subKey}" + child_header = f"{current_level}_{subKey}" child_value[child_header] = subValue - - k_ = f"{prev_step}{k}" - dbConn.write_to_database(k_, child_value) + self.write_to_database(k_, child_value) def make_tables(self): + if args.verbose >= 1: + print("making tables...") + for (mhKey, mhVal) in self.table_mapping.items(): + print(f"table {mhKey}, cols {mhVal}") for (mhKey, mhVal) in self.table_mapping.items(): self.make_table(mhKey, mhVal) @@ -209,15 +224,16 @@ class DBConn: children_names = children.keys() + top_level_idx = 0 # second pass, make flat json from original-json, put to sqlite, use pandas to make csv for inp_file in input_files: - if args.debug: + if args.verbose >= 1: print(f"processing file {inp_file}") for topLevelItem in ijson.items(open(input_path / inp_file), "item"): keys = topLevelItem.keys() - flat_json = {} + flat_json = {f"{top_level}_seq": top_level_idx + 1} for key in keys: if key in flat_attributes: for (subKey, subValue) in topLevelItem[key].items(): @@ -225,12 +241,16 @@ class DBConn: flat_json[child_attribute] = subValue elif key in children_names: ext = children[key] - self.extract_child_value(topLevelItem, key, ext, f"{top_level}_", + self.extract_child_value(topLevelItem, + key, + ext, f"{top_level}_", + top_level_idx + 1, topLevelItem[args.join_column]) else: flat_json[key] = topLevelItem[key] self.write_to_database(top_level, flat_json) + top_level_idx += 1 if not args.single: self.con.commit() @@ -248,4 +268,12 @@ class DBConn: dbConn = DBConn() if __name__ == '__main__': + if args.verbose >= 1: + print(f"args = {args}") + + if args.clean: + for d in args.output.glob("*.csv"): + print(f"will delete {d}") + os.unlink(d) + dbConn.parse_json()