import glob
import json
import os

import pandas as pd
import plotly.express as px
from joblib import Parallel, delayed
from tqdm import tqdm
def merge_stats(original, new):
    """Accumulate the nested {stat_type: {key: count}} dict `new` into `original` in place."""
    for stat_type, values in new.items():
        if stat_type not in original:
            original[stat_type] = {}
        for key, count in values.items():
            if key not in original[stat_type]:
                original[stat_type][key] = 0
            original[stat_type][key] += count
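# Hedged usage sketch: merge_stats mutates its first argument and sums counts
# for keys present in both dicts:
#   acc = {'define': {('car', 'sedan'): 1}}
#   merge_stats(acc, {'define': {('car', 'sedan'): 2}, 'status': {('car', 'moving'): 1}})
#   acc == {'define': {('car', 'sedan'): 3}, 'status': {('car', 'moving'): 1}}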
def glob_in_subdir(subdir_pattern):
    """Run a single glob; executed in a worker process by parallel_glob."""
    return glob.glob(subdir_pattern, recursive=True)
def parallel_glob(base_pattern, n_jobs=-1):
    """Expand `base_pattern` by globbing each subdirectory in a separate job."""
    # Split the pattern into the directory portion and the trailing file wildcard.
    static_part, _, wildcard = base_pattern.rpartition('/')
    # Expand the directory portion; recursive=True lets '**' match any depth,
    # and filtering on isdir drops plain files picked up along the way.
    subdirs = [d for d in glob.glob(static_part, recursive=True) if os.path.isdir(d)]
    # Construct one file pattern per subdirectory.
    subdir_patterns = [f"{subdir}/{wildcard}" for subdir in subdirs]
    # Search the subdirectories in parallel.
    results = Parallel(n_jobs=n_jobs)(
        delayed(glob_in_subdir)(pattern)
        for pattern in tqdm(subdir_patterns, desc="Processing subdirectories"))
    # Flatten the list of lists into a single file list.
    all_files = [item for sublist in results for item in sublist]
    return all_files
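# A minimal single-process alternative, assuming a plain directory tree; the
# helper name `walk_glob` is ours, not part of the original pipeline. It can
# be handy when joblib's worker startup outweighs the parallel speedup.
def walk_glob(base_dir, suffix=".json"):
    matches = []
    for root, _dirs, files in os.walk(base_dir):
        matches.extend(os.path.join(root, name) for name in files if name.endswith(suffix))
    return matches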
def transform_to_dataframe_format(stats):
    """Flatten the nested stats dict into a long-format DataFrame, one row per (file, stat, key)."""
    rows = []
    for file_name, file_stats in stats.items():
        for stat_type, values in file_stats.items():
            for (category, attribute_value), count in values.items():
                rows.append({
                    'file_name': file_name,
                    'stat_type': stat_type,
                    'category': category,
                    'attribute_value': attribute_value,
                    'count': count,
                })
    return pd.DataFrame(rows)
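# Sketch of intended use; transform_to_dataframe_format is defined but never
# called in __main__ below. A typical follow-up aggregation might be:
#   df = transform_to_dataframe_format(stats)
#   totals = df.groupby(['stat_type', 'category'])['count'].sum()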
def process_file(file_path):
    """Parse one annotation JSON and return (file_name, per-file stats), or (None, None) on failure."""
    with open(file_path, 'r') as f:
        try:
            record = json.load(f)
        except json.JSONDecodeError:
            print(f"Failed to decode JSON from file: {file_path}")
            return None, None
    # Simplified filename extraction: basename up to the first underscore.
    file_name = record['file'].split('/')[-1].split('_')[0]
    file_stats = {'define': {}, 'status': {}}
    for annotation in record['frames']['annotations']:
        category = annotation['category']['code']
        for attribute in annotation['category']['attributes']:
            # Bucket each attribute by whether its code marks a definition or a status.
            if 'define' in attribute['code']:
                stat_type = 'define'
            elif 'status' in attribute['code']:
                stat_type = 'status'
            else:
                continue
            key = (category, attribute['value'])
            if key not in file_stats[stat_type]:
                file_stats[stat_type][key] = 0
            file_stats[stat_type][key] += 1
    return file_name, file_stats
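# Assumed record layout, inferred from the parsing above rather than from a
# schema document; field names outside this shape are guesses:
# {
#   "file": ".../<clip>_<suffix>.json",
#   "frames": {
#     "annotations": [
#       {"category": {"code": "...",
#                     "attributes": [{"code": "...define... | ...status...", "value": "..."}]}}
#     ]
#   }
# }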
def generate_and_plot(stats_data, title):
    """Build a long-format DataFrame from per-file attribute counts and plot it as a sunburst."""
    hierarchical_data = []
    # One row per (file, category, value) triple.
    for file_name, attributes in stats_data.items():
        for (category, value), count in attributes.items():
            hierarchical_data.append({"file_name": file_name, "category": category, "value": value, "count": count})
    df = pd.DataFrame(hierarchical_data)
    # Pivot to a MultiIndex summary table, exported alongside the plot.
    df_pivot = df.pivot_table(index=['file_name', 'category', 'value'], values='count', aggfunc='sum')
    # Size sectors by the count column; without values= the sunburst would only count rows.
    fig = px.sunburst(df, path=['file_name', 'category', 'value'], values='count')
    # Leave some top margin so the title is not clipped.
    fig.update_layout(margin=dict(t=30, b=0, l=0, r=0), title=title)
    fig.show()
    return fig, df, df_pivot
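# Optional sketch: persist a figure as standalone HTML so plots survive the
# session. fig.write_html is part of plotly's public Figure API; the helper
# name `save_figure` is ours, not from the original pipeline.
def save_figure(fig, path):
    fig.write_html(path)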
if __name__ == '__main__':
    path = "/media/juni/T7 Shield/DeageonCCTV/인공지능_학습용_도심내_CCTV_영상_데이터셋/**/*.json"
    dataset = parallel_glob(path)
    # dataset = glob.glob("/media/juni/T7 Shield/DeageonCCTV/인공지능_학습용_도심내_CCTV_영상_데이터셋/21730619/*.json")
    print(f"Located {len(dataset)} annotation files")
    results = Parallel(n_jobs=-1)(delayed(process_file)(file_path) for file_path in tqdm(dataset))
    stats = {}
    for directory_name, file_stats in results:
        if file_stats is None:
            continue  # skip files that failed JSON decoding
        if directory_name not in stats:
            stats[directory_name] = {}
        merge_stats(stats[directory_name], file_stats)
    print(stats)
    define_stats = {file: data['define'] for file, data in stats.items()}
    status_stats = {file: data['status'] for file, data in stats.items()}
    def_fig, def_data, def_multi_data = generate_and_plot(define_stats, "Object Definitions")
    def_data.to_csv("define_stats.csv", index=False)
    def_multi_data.to_csv("define_stats_multi.csv")  # keep the MultiIndex as columns
    stat_fig, stat_data, stat_multi_data = generate_and_plot(status_stats, "Object Status")
    stat_data.to_csv("status_stats.csv", index=False)
    stat_multi_data.to_csv("status_stats_multi.csv")  # fixed filename; keep the MultiIndex