diff --git a/main.py b/main.py
index d922013..e8b300a 100644
--- a/main.py
+++ b/main.py
@@ -12,6 +12,7 @@ parser.add_argument("--input", help="folder containing input json(s)", required=
 parser.add_argument("--output", help="folder to place csv", required=True, type=pathlib.Path)
 parser.add_argument("--delimiter", help="delimiter for CSV (default is '|'", default="|")
 parser.add_argument("--single", action="store_true", help="merge all json files to single output csv")
+parser.add_argument("--debug", action="store_true", help="print debug logs")
 parser.add_argument("--metadata", type=int, help="how many records to parse for building metadata", default=1000)
 parser.add_argument("--join-column", help="join column from top-level to merge nested json", required=True)
 parser.add_argument("--name", help="join column from top-level to merge nested json", required=True)
@@ -26,6 +27,7 @@ class DBConn:
         self.counter = 0
         self.ts = ''
         self.reinit_db()
+        self.table_col_map = {}

     def reinit_db(self):
         self.counter += 1
@@ -37,8 +39,10 @@ class DBConn:
         syspk = "syspk integer primary key autoincrement"
         other_cols = ', '.join([f"\"{f}\" TEXT" for f in cols])
         create_tbl_sql = f"create table if not exists \"{tbl_name}\" ({syspk}, {other_cols})"
-        print(f"create sql = {create_tbl_sql}")
+        if args.debug:
+            print(f"create sql = {create_tbl_sql}")
         self.cur.execute(create_tbl_sql)
+        self.table_col_map[tbl_name] = cols

     def write_to_database(self, tbl, cols):
         keys = cols.keys() # OrderedDict
@@ -52,8 +56,22 @@ class DBConn:
         # trim values of spaces
         values = tuple([str(cols[k]).strip() for k in keys])

+        # there might be a scenario where new cols appear at a later stage; if so,
+        # we accommodate them by adding them to the database table
+        tbl_cols = self.table_col_map[tbl]
+        not_found = []
+        for col in cols:
+            if col not in tbl_cols:
+                not_found.append(col)
+
+        if len(not_found) > 0:
+            for new_col in not_found:
+                self.cur.execute(f"alter table \"{tbl}\" add column \"{new_col}\" text")
+
+            print(f"added {not_found} cols to {tbl}")
+            self.table_col_map[tbl] = cols
+
         sql = f"insert into \"{tbl}\" ({col_names}) values({value_placeholders})"
-        print(f"sql = {sql}")
         self.cur.execute(sql, values)

     def make_csv_from_tables(self, prefix=''):
@@ -142,6 +160,7 @@ def parse_json():
     top_level = args.name

     # first pass, collect all headers
+    print(f"parsing {input_files[0]} for metadata")
     for topLevelItem in ijson.items(open(input_path / input_files[0]), "item"):
         if parsed == args.metadata:
             print(f"parsed {parsed} records for metadata")
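
For context, here is a minimal standalone sketch of the schema-evolution idea the new `write_to_database` logic relies on: when an incoming record carries a column the table has not seen yet, the column is added via `ALTER TABLE ... ADD COLUMN ... text` before inserting. The table name `events`, the sample record, and the in-memory database are made up for illustration; this is not the PR's code, just the technique it uses.

```python
import sqlite3

# Standalone sketch (assumed names): track known columns, add unseen ones as TEXT,
# then insert the record with parameterized placeholders.
conn = sqlite3.connect(":memory:")
cur = conn.cursor()
cur.execute('create table "events" (syspk integer primary key autoincrement, "name" TEXT)')
known_cols = ["name"]

record = {"name": "login", "city": "Berlin"}  # "city" is a column the table has not seen yet
for col in record:
    if col not in known_cols:
        cur.execute(f'alter table "events" add column "{col}" text')
        known_cols.append(col)

col_names = ", ".join(f'"{c}"' for c in record)
placeholders = ", ".join("?" for _ in record)
cur.execute(f'insert into "events" ({col_names}) values ({placeholders})',
            tuple(str(v).strip() for v in record.values()))
conn.commit()
```

As in the diff, every added column is typed as `text`, which keeps the ALTER statement trivial at the cost of pushing any type handling to the CSV consumer.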