move to a class-based design, make schema name specific
parent 184ee74cda
commit 6082809715

main.py · 295 changed lines
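In short: the module-level parse_json() / extract_child() / extract_child_value() helpers move onto the DBConn class, per-run state (table_col_map, child_tables) lives on the instance, and the schema dump is now written to schema-<args.name>.json instead of a shared schema.json. A rough usage sketch of the new flow, assuming the argparse setup elsewhere in main.py populates args (names taken from the diff below):

    # sketch only: DBConn and args are defined in main.py as shown in this diff
    dbConn = DBConn()      # __init__ sets up instance state and calls init_db(), which opens the SQLite file
    dbConn.parse_json()    # pass 1: collect attributes and write schema-<args.name>.json
                           # pass 2: flatten records into SQLite tables, then export them as CSV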
@@ -26,14 +26,16 @@ class DBConn:
         self.con = None
         self.counter = 0
         self.ts = ''
-        self.reinit_db()
         self.table_col_map = {}
+        self.child_tables = {}
+        self.init_db()
 
-    def reinit_db(self):
+    def init_db(self):
         self.counter += 1
         self.ts = time.strftime('%Y%m%d_%H%M%S', time.localtime())
         self.con = sqlite3.connect(args.output / f"data-{args.name}-{self.ts}-{self.counter}.db")
         self.cur = self.con.cursor()
+        self.make_tables()
 
     def make_table(self, tbl_name, cols):
         syspk = "syspk integer primary key autoincrement"
@@ -86,151 +88,152 @@ class DBConn:
             clients = pd.read_sql(f"SELECT * FROM \"{tbl}\"", self.con)
             clients.to_csv(args.output / f"{prefix}{tbl}.csv", index=False, sep=args.delimiter)
 
+    def extract_child(self, items_list, current_level, prev_step, existing, step):
+
+        attributes = existing.get("attributes", [args.join_column])
+        flat_attributes = existing.get("flat_attributes", [])
+        children = existing.get("children", {})
+
+        for child_idx in range(len(items_list)):
+            child = items_list[child_idx]
+            for (subKey, subValue) in child.items():
+
+                is_dict = isinstance(subValue, dict)
+                is_list = isinstance(subValue, list)
+
+                if is_dict:
+                    if subKey not in flat_attributes:
+                        flat_attributes.append(subKey)
+                    for (sub2Key, sub2Value) in subValue.items():
+                        composite_key = f"{current_level}_{sub2Key}"
+                        if composite_key not in attributes:
+                            attributes.append(composite_key)
+                if is_list:
+                    if subKey not in children.keys():
+                        existing_next_step = children.get(subKey, {})
+                        children[subKey] = self.extract_child(subValue, subKey,
+                                                              f"{prev_step}{current_level}_",
+                                                              existing_next_step, step + 1)
+                else:
+                    child_header = f"{current_level}_{subKey}"
+                    if child_header not in attributes:
+                        attributes.append(child_header)
+
+        self.child_tables[f"{prev_step}{current_level}"] = attributes
+        return {"flat_attributes": flat_attributes, "children": children, "attributes": attributes}
+
+    def extract_child_value(self, item, k, ext, prev_step, join_col_value):
+        child_value = {args.join_column: join_col_value}
+        for child in item[k]:
+            for (subKey, subValue) in child.items():
+                is_ext = subKey in ext.get("children", {}).keys()
+                if is_ext:
+                    self.extract_child_value(child, subKey, ext["children"][subKey],
+                                             f"{prev_step}{k}_",
+                                             join_col_value)
+                else:
+                    child_header = f"{k}_{subKey}"
+                    child_value[child_header] = subValue
+
+        k_ = f"{prev_step}{k}"
+        dbConn.write_to_database(k_, child_value)
+
+    def make_tables(self):
+        for (mhKey, mhVal) in self.child_tables.items():
+            self.make_table(mhKey, mhVal)
+
+    def parse_json(self):
+        children = {}
+        flat_attributes = []
+        attributes = []
+
+        input_path = args.input
+
+        input_files = [x.name for x in input_path.iterdir() if
+                       not x.is_dir() and x.is_file() and x.name.endswith(".json")]
+        print(f"found input file(s) {', '.join(input_files)} in path {args.input}")
+
+        if len(input_files) == 0:
+            print("could not find any input files, we shall stop")
+            return
+
+        parsed = 0
+        top_level = args.name
+
+        # first pass, collect all headers
+        print(f"parsing {input_files[0]} for metadata")
+        for topLevelItem in ijson.items(open(input_path / input_files[0]), "item"):
+            if parsed == args.metadata:
+                print(f"parsed {parsed} records for metadata")
+                break
+
+            for (key, value) in topLevelItem.items():
+                value_is_dict = isinstance(value, dict)
+                value_is_list = isinstance(value, list)
+
+                if value_is_dict:
+                    if key not in flat_attributes:
+                        flat_attributes.append(key)
+
+                    for (subKey, subValue) in value.items():
+                        child_attribute = f"{key}_{subKey}"
+                        if child_attribute not in attributes:
+                            attributes.append(child_attribute)
+                elif value_is_list:
+                    existing = children.get(key, {})
+                    children[key] = self.extract_child(value, key, f"{top_level}_", existing, 1)
+                else:
+                    if key not in attributes:
+                        attributes.append(key)
+            parsed += 1
+
+        open(f"schema-{args.name}.json", "w").write(
+            json.dumps({
+                "attributes": attributes,
+                "flat_attributes": flat_attributes,
+                "children": children
+            }, indent=2)
+        )
+        self.child_tables[top_level] = attributes
+
+        self.make_tables()
+
+        children_names = children.keys()
+
+        # second pass, make flat json from original-json, put to sqlite, use pandas to make csv
+        for inp_file in input_files:
+
+            if args.debug:
+                print(f"processing file {inp_file}")
+
+            for topLevelItem in ijson.items(open(input_path / inp_file), "item"):
+                keys = topLevelItem.keys()
+                flat_json = {}
+                for key in keys:
+                    if key in flat_attributes:
+                        for (subKey, subValue) in topLevelItem[key].items():
+                            child_attribute = f"{key}_{subKey}"
+                            flat_json[child_attribute] = subValue
+                    elif key in children_names:
+                        ext = children[key]
+                        self.extract_child_value(topLevelItem, key, ext, f"{top_level}_",
+                                                 topLevelItem[args.join_column])
+                    else:
+                        flat_json[key] = topLevelItem[key]
+
+                self.write_to_database(top_level, flat_json)
+
+            if not args.single:
+                self.con.commit()
+                self.make_csv_from_tables(prefix=f"{pathlib.Path(args.output / inp_file).stem}-")
+                self.init_db()
+
+        if args.single:
+            self.con.commit()
+            self.make_csv_from_tables()
 
 
 dbConn = DBConn()
 
 
-def extract_child(merge_headers, items_list, top_level, prev_step, existing, step):
-    headers = merge_headers.get(top_level, [args.join_column])
-
-    flat_keys = existing.get("flat_keys", [])
-    extract_keys = existing.get("extract_keys", {})
-
-    for child_idx in range(len(items_list)):
-        child = items_list[child_idx]
-        for (subKey, subValue) in child.items():
-
-            is_dict = isinstance(subValue, dict)
-            is_list = isinstance(subValue, list)
-
-            if is_dict:
-                if subKey not in flat_keys:
-                    flat_keys.append(subKey)
-                for (sub2Key, sub2Value) in subValue.items():
-                    composite_key = f"{top_level}_{sub2Key}"
-                    if composite_key not in headers:
-                        headers.append(composite_key)
-            if is_list:
-                if subKey not in extract_keys.keys():
-                    existing_next_step = extract_keys.get(subKey, {})
-                    extract_keys[subKey] = extract_child(merge_headers, subValue, subKey, f"{prev_step}{top_level}_",
-                                                         existing_next_step, step + 1)
-            else:
-                child_header = f"{top_level}_{subKey}"
-                if child_header not in headers:
-                    headers.append(child_header)
-
-    merge_headers[f"{prev_step}{top_level}"] = headers
-    return {"flat_keys": flat_keys, "extract_keys": extract_keys}
-
-
-def extract_child_value(merge_headers, item, k, ext, prev_step, join_col_value):
-    child_value = {args.join_column: join_col_value}
-    for child in item[k]:
-        for (subKey, subValue) in child.items():
-            is_ext = subKey in ext.get("extract_keys", {}).keys()
-            if is_ext:
-                extract_child_value(merge_headers, child, subKey, ext["extract_keys"][subKey], f"{prev_step}{k}_",
-                                    join_col_value)
-            else:
-                child_header = f"{k}_{subKey}"
-                child_value[child_header] = subValue
-
-    k_ = f"{prev_step}{k}"
-    dbConn.write_to_database(k_, child_value)
-
-
-def parse_json():
-    extract_keys = {}
-    flat_keys = []
-
-    headers = []
-    merge_headers = {}
-
-    input_path = args.input
-
-    input_files = [x.name for x in input_path.iterdir() if not x.is_dir() and x.is_file() and x.name.endswith(".json")]
-    print(f"found input file(s) {', '.join(input_files)} in path {args.input}")
-
-    if len(input_files) == 0:
-        print("could not find any input files, we shall stop")
-        return
-
-    parsed = 0
-    top_level = args.name
-
-    # first pass, collect all headers
-    print(f"parsing {input_files[0]} for metadata")
-    for topLevelItem in ijson.items(open(input_path / input_files[0]), "item"):
-        if parsed == args.metadata:
-            print(f"parsed {parsed} records for metadata")
-            break
-
-        for (key, value) in topLevelItem.items():
-            value_is_dict = isinstance(value, dict)
-            value_is_list = isinstance(value, list)
-
-            if value_is_dict:
-                if key not in flat_keys:
-                    flat_keys.append(key)
-
-                for (subKey, subValue) in value.items():
-                    composite_key = f"{key}_{subKey}"
-                    if composite_key not in headers:
-                        headers.append(composite_key)
-            elif value_is_list:
-                existing = extract_keys.get(key, {})
-                extract_keys[key] = extract_child(merge_headers, value, key, f"{top_level}_", existing, 1)
-            else:
-                if key not in headers:
-                    headers.append(key)
-        parsed += 1
-
-    open("schema.json", "w").write(
-        json.dumps({
-            "flat_keys": flat_keys,
-            "extract_keys": extract_keys
-        }, indent=2)
-    )
-    dbConn.make_table(top_level, headers)
-
-    for (mhKey, mhVal) in merge_headers.items():
-        dbConn.make_table(mhKey, mhVal)
-
-    extract_keys_names = extract_keys.keys()
-
-    # second pass, make flat json from original-json, create csv ( we will use sqlite, as it is faster to write)
-    for inp_file in input_files:
-        for topLevelItem in ijson.items(open(input_path / inp_file), "item"):
-            keys = topLevelItem.keys()
-            flat_json = {}
-            for key in keys:
-                if key in flat_keys:
-                    for (subKey, subValue) in topLevelItem[key].items():
-                        composite_key = f"{key}_{subKey}"
-                        flat_json[composite_key] = subValue
-                elif key in extract_keys_names:
-                    ext = extract_keys[key]
-                    extract_child_value(merge_headers, topLevelItem, key, ext, f"{top_level}_",
-                                        topLevelItem[args.join_column])
-                else:
-                    flat_json[key] = topLevelItem[key]
-
-            dbConn.write_to_database(top_level, flat_json)
-
-        if not args.single:
-            dbConn.con.commit()
-            dbConn.make_csv_from_tables(prefix=f"{pathlib.Path(args.output / inp_file).stem}-")
-            dbConn.reinit_db()
-            dbConn.make_table(top_level, headers)
-
-            for (mhKey, mhVal) in merge_headers.items():
-                dbConn.make_table(mhKey, mhVal)
-
-    if args.single:
-        dbConn.con.commit()
-        dbConn.make_csv_from_tables()
-
-
 if __name__ == '__main__':
-    parse_json()
+    dbConn.parse_json()