Merge remote-tracking branch 'origin/master'

add a matching seq no
2023-06-21 17:44:21 +05:30 · 2023-06-21 17:41:56 +05:30
1 changed files with 45 additions and 18 deletions
--- a/main.py
+++ b/main.py
@@ -1,4 +1,5 @@
 import json
+import os
 import pathlib
 import time
 import shutil
@@ -13,8 +14,9 @@ parser.add_argument("--input", help="folder containing input json(s)", required=
 parser.add_argument("--output", help="folder to place csv", required=True, type=pathlib.Path)
 parser.add_argument("--delimiter", help="delimiter for CSV (default is '|'", default="|")
 parser.add_argument("--single", action="store_true", help="merge all json files to single output csv")
-parser.add_argument("--debug", action="store_true", help="print debug logs")
+parser.add_argument("--verbose", '-v', action="count", help="set verbose level", default=0)
 parser.add_argument("--zip", action="store_true", help="make a zipfile of all outputs")
+parser.add_argument("--clean", action="store_true", help="clear output directory")
 parser.add_argument("--metadata", type=int, help="how many records to parse for building metadata", default=1000)
 parser.add_argument("--join-column", help="join column from top-level to merge nested json", required=True)
 parser.add_argument("--name", help="base name, to be used in creating all data", required=True)
@@ -50,7 +52,7 @@ class DBConn:
        syspk = "syspk integer primary key autoincrement"
        other_cols = ', '.join([f"\"{f}\" TEXT" for f in cols])
        create_tbl_sql = f"create table if not exists \"{tbl_name}\" ({syspk}, {other_cols})"
-        if args.debug:
+        if args.verbose >= 1:
            print(f"create sql = {create_tbl_sql}")
        self.cur.execute(create_tbl_sql)

@@ -76,14 +78,18 @@ class DBConn:
                    not_found.append(col)

            if len(not_found) > 0:
+                if args.verbose >= 1:
+                    print(
+                        f"added new cols {', '.join(not_found)} to {tbl}, already present {tbl_cols}, want {col_names}")
+
+                new_cols = list(tbl_cols)
                for new_col in not_found:
                    self.cur.execute(f"alter table \"{tbl}\" add column \"{new_col}\" text")
-
-                print(f"added new cols {', '.join(not_found)} to {tbl}")
-                self.table_mapping[tbl] = list(col_names)
+                    new_cols.append(new_col)
+                self.table_mapping[tbl] = new_cols

        sql = f"insert into \"{tbl}\" ({col_names_placeholder}) values({value_placeholders})"
-        if args.debug:
+        if args.verbose == 2:
            print(f"insert sql = {sql}")
        self.cur.execute(sql, values)

@@ -133,23 +139,31 @@ class DBConn:
        self.table_mapping[f"{prev_step}{current_level}"] = attributes
        return {"flat_attributes": flat_attributes, "children": children, "attributes": attributes}

-    def extract_child_value(self, item, k, ext, prev_step, join_col_value):
-        child_value = {args.join_column: join_col_value}
-        for child in item[k]:
+    def extract_child_value(self, item, current_level, ext, prev_step, prev_step_seq, join_col_value):
+        k_ = f"{prev_step}{current_level}"
+
+        for seqNo, child in enumerate(item[current_level]):
+            child_value = {
+                args.join_column: join_col_value,
+                f"{prev_step}seq": prev_step_seq,
+                f"{current_level}_seq": seqNo + 1
+            }
            for (subKey, subValue) in child.items():
                is_ext = subKey in ext.get("children", {}).keys()
                if is_ext:
                    self.extract_child_value(child, subKey, ext["children"][subKey],
-                                             f"{prev_step}{k}_",
-                                             join_col_value)
+                                             f"{prev_step}{current_level}_",
+                                             seqNo + 1, join_col_value)
                else:
-                    child_header = f"{k}_{subKey}"
+                    child_header = f"{current_level}_{subKey}"
                    child_value[child_header] = subValue
-
-        k_ = f"{prev_step}{k}"
-        dbConn.write_to_database(k_, child_value)
+            self.write_to_database(k_, child_value)

    def make_tables(self):
+        if args.verbose >= 1:
+            print("making tables...")
+            for (mhKey, mhVal) in self.table_mapping.items():
+                print(f"table {mhKey}, cols {mhVal}")
        for (mhKey, mhVal) in self.table_mapping.items():
            self.make_table(mhKey, mhVal)

@@ -211,15 +225,16 @@ class DBConn:

        children_names = children.keys()

+        top_level_idx = 0
        # second pass, make flat json from original-json, put to sqlite, use pandas to make csv
        for inp_file in input_files:

-            if args.debug:
+            if args.verbose >= 1:
                print(f"processing file {inp_file}")

            for topLevelItem in ijson.items(open(input_path / inp_file), "item"):
                keys = topLevelItem.keys()
-                flat_json = {}
+                flat_json = {f"{top_level}_seq": top_level_idx + 1}
                for key in keys:
                    if key in flat_attributes:
                        for (subKey, subValue) in topLevelItem[key].items():
@@ -227,12 +242,16 @@ class DBConn:
                            flat_json[child_attribute] = subValue
                    elif key in children_names:
                        ext = children[key]
-                        self.extract_child_value(topLevelItem, key, ext, f"{top_level}_",
+                        self.extract_child_value(topLevelItem,
+                                                 key,
+                                                 ext, f"{top_level}_",
+                                                 top_level_idx + 1,
                                                 topLevelItem[args.join_column])
                    else:
                        flat_json[key] = topLevelItem[key]

                self.write_to_database(top_level, flat_json)
+                top_level_idx += 1

            if not args.single:
                self.con.commit()
@@ -250,4 +269,12 @@ class DBConn:
 dbConn = DBConn()

 if __name__ == '__main__':
+    if args.verbose >= 1:
+        print(f"args = {args}")
+
+    if args.clean:
+        for d in args.output.glob("*.csv"):
+            print(f"will delete {d}")
+            os.unlink(d)
+
    dbConn.parse_json()
Author	SHA1	Message	Date
gowthaman.b	5ceecc6bf4	Merge remote-tracking branch 'origin/master'	2023-06-21 17:44:21 +05:30
gowthaman.b	72c13c353f	add a matching seq no	2023-06-21 17:41:56 +05:30