import pandas as pd
import glob
import json
import os
import multiprocessing
import concurrent.futures
from joblib import Parallel, delayed
from tqdm import tqdm
import plotly.graph_objects as go
import plotly.express as px
def merge_stats(original, new):
    """Accumulate the counts from ``new`` into ``original`` in place.

    Both arguments map a stat type to a ``{key: count}`` dictionary. Stat
    types and keys missing from ``original`` are created on demand.

    Args:
        original: Nested count dictionary that is updated in place.
        new: Nested count dictionary to add. ``None`` or an empty mapping is
            treated as "nothing to merge".

    Returns:
        None. ``original`` is mutated.
    """
    if not new:
        # process_file() hands back None stats for files it could not decode;
        # tolerate that here instead of failing on ``None.items()``.
        return
    for stat_type, values in new.items():
        bucket = original.setdefault(stat_type, {})
        for key, count in values.items():
            bucket[key] = bucket.get(key, 0) + count
def glob_in_subdir(subdir_pattern):
    """Return the list of paths matching ``subdir_pattern``.

    ``**`` components in the pattern are expanded recursively. This is the
    per-pattern task that ``parallel_glob`` dispatches to its workers.
    """
    matches = list(glob.iglob(subdir_pattern, recursive=True))
    return matches
def parallel_glob(base_pattern, n_jobs=-1):
    """Expand ``base_pattern`` by globbing its top-level directories in parallel.

    The pattern is split at its last ``/``. The directories matched by the
    left-hand part are each searched for the right-hand part in a separate
    joblib task.

    When the last directory component is ``**`` (e.g. ``root/**/*.json``) the
    search is truly recursive: files directly under ``root`` are included and
    every first-level directory is searched at any depth, which matches what
    ``glob.glob(base_pattern, recursive=True)`` would find.

    Args:
        base_pattern: Glob pattern using ``/`` as the separator.
        n_jobs: Number of joblib workers; ``-1`` uses all available cores.

    Returns:
        A flat list of matching paths.
    """
    static_part, _, wildcard = base_pattern.rpartition('/')
    if not static_part:
        # No directory part to fan out over (e.g. "*.json" or "/*.json");
        # globbing an empty string would silently match nothing.
        return glob.glob(base_pattern, recursive=True)

    parent, parent_sep, last_dir = static_part.rpartition('/')
    if last_dir == '**':
        # "**" also matches zero directories, so collect the files that sit
        # directly under the root before descending into its children.
        root = parent + parent_sep
        all_files = glob.glob(f"{root}{wildcard}")
        top_level = glob.glob(f"{root}*")
        inner = f"**/{wildcard}"
    else:
        all_files = []
        top_level = glob.glob(static_part)
        inner = wildcard

    # Only directories can contain matches. Escape the concrete directory
    # names so characters such as "[" in them are not read as glob syntax.
    subdir_patterns = [
        f"{glob.escape(subdir)}/{inner}"
        for subdir in top_level
        if os.path.isdir(subdir)
    ]

    # Search each subdirectory in its own task.
    results = Parallel(n_jobs=n_jobs)(
        delayed(glob_in_subdir)(pattern)
        for pattern in tqdm(subdir_patterns, desc="Processing subdirectories")
    )

    # Flatten the per-directory result lists.
    for sublist in results:
        all_files.extend(sublist)
    return all_files
def transform_to_dataframe_format(stats):
    """Flatten nested per-file statistics into a long-format DataFrame.

    Args:
        stats: Mapping ``{file_name: {stat_type: {(category, attribute_value):
            count}}}``.

    Returns:
        A DataFrame with one row per ``(file_name, stat_type, category,
        attribute_value)`` combination and the columns ``file_name``,
        ``stat_type``, ``category``, ``attribute_value`` and ``count``. The
        columns are present even when ``stats`` is empty.
    """
    columns = ['file_name', 'stat_type', 'category', 'attribute_value', 'count']
    rows = []
    for file_name, file_stats in stats.items():
        for stat_type, values in file_stats.items():
            for (category, attribute_value), count in values.items():
                rows.append({
                    'file_name': file_name,
                    'stat_type': stat_type,
                    'category': category,
                    'attribute_value': attribute_value,
                    'count': count,
                })
    # Passing the column list keeps the schema stable for empty input.
    return pd.DataFrame(rows, columns=columns)
def process_file(file_path):
    """Count ``define`` and ``status`` attribute values in one annotation file.

    The JSON document is read as a mapping with a ``'file'`` path string and
    ``record['frames']['annotations']``, where each annotation carries
    ``category.code`` and a list of ``category.attributes`` entries with
    ``code`` and ``value`` fields.

    Args:
        file_path: Path of the JSON file to read.

    Returns:
        ``(file_name, file_stats)`` where ``file_name`` is the part of the
        recorded file's base name before the first underscore, and
        ``file_stats`` is ``{'define': {...}, 'status': {...}}`` mapping
        ``(category, attribute_value)`` to an occurrence count. Returns
        ``(None, None)`` when the file is not valid JSON.
    """
    try:
        # JSON is UTF-8 by specification; do not depend on the locale default.
        with open(file_path, 'r', encoding='utf-8') as f:
            record = json.load(f)
    except json.JSONDecodeError:
        print(f"Failed to decode JSON from file: {file_path}")
        return None, None

    file_name = record['file'].split('/')[-1].split('_')[0]  # Simplified filename extraction
    file_stats = {'define': {}, 'status': {}}
    for annotation in record['frames']['annotations']:
        category = annotation['category']['code']
        for attribute in annotation['category']['attributes']:
            code = attribute['code']
            if 'define' in code:
                stat_type = 'define'
            elif 'status' in code:
                stat_type = 'status'
            else:
                # Neither kind: skip it rather than reusing the stat type of
                # the previous attribute (or hitting an unbound variable).
                continue
            key = (category, attribute['value'])
            counts = file_stats[stat_type]
            counts[key] = counts.get(key, 0) + 1
    return file_name, file_stats
def generate_and_plot(stats_data, title):
    """Build a sunburst chart and tabular views of per-file attribute counts.

    Args:
        stats_data: Mapping ``{file_name: {(category, value): count}}``.
        title: Title applied to the figure layout.

    Returns:
        ``(fig, df, df_pivot)``: the Plotly sunburst figure, the long-format
        DataFrame with columns ``file_name``, ``category``, ``value`` and
        ``count``, and a pivot table of summed counts indexed by
        ``(file_name, category, value)``.
    """
    hierarchy = ['file_name', 'category', 'value']
    hierarchical_data = []
    # Add all file data
    for file_name, attributes in stats_data.items():
        for (category, value), count in attributes.items():
            hierarchical_data.append(
                {"file_name": file_name, "category": category, "value": value, "count": count}
            )
    # Name the columns explicitly so an empty input still has the columns
    # that pivot_table looks up below.
    df = pd.DataFrame(hierarchical_data, columns=hierarchy + ['count'])
    # Pivot to create MultiIndex
    df_pivot = df.pivot_table(index=hierarchy, values='count', aggfunc='sum')
    # Size the sectors by the aggregated counts; without ``values`` every row
    # would contribute equally regardless of its count.
    fig = px.sunburst(df, path=hierarchy, values='count')
    fig.update_layout(margin=dict(t=0, b=0, l=0, r=0), title=title)
    return fig, df, df_pivot
if __name__ == '__main__':
    # Locate every annotation file, aggregate its attribute counts per file
    # name, and export the "define" and "status" statistics as CSV files.
    path = "/media/juni/T7 Shield/DeageonCCTV/인공지능_학습용_도심내_CCTV_영상_데이터셋/**/*.json"
    dataset = parallel_glob(path)
    # dataset = glob.glob("/media/juni/T7 Shield/DeageonCCTV/인공지능_학습용_도심내_CCTV_영상_데이터셋/21730619/*.json")
    print("files are located")
    results = Parallel(n_jobs=-1)(delayed(process_file)(file_path) for file_path in tqdm(dataset))

    stats = {}
    for directory_name, file_stats in results:
        if file_stats is None:
            # process_file returns (None, None) for files it could not decode.
            continue
        merge_stats(stats.setdefault(directory_name, {}), file_stats)

    define_stats = {file: data['define'] for file, data in stats.items()}
    status_stats = {file: data['status'] for file, data in stats.items()}

    def_fig, def_data, df_multi_data = generate_and_plot(define_stats, "Object Definitions")
    def_data.to_csv("define_stats.csv", index=False)
    # The pivot table keeps file_name/category/value in its index, so the
    # index must be written or the CSV would contain only the counts.
    df_multi_data.to_csv("define_stats_multi.csv")

    stat_fig, stat_data, df_multi_data = generate_and_plot(status_stats, "Object Status")
    stat_data.to_csv("status_stats.csv", index=False)
    df_multi_data.to_csv("status_stats_multi.csv")