import gzip
import json
import os
import time
import zlib

# Function to read compressed JSON file
def read_gzipped_json(file_path):
    try:
        with gzip.open(file_path, 'rt', encoding='utf-8') as f:
            data = json.load(f)
        return data
    except (OSError, json.JSONDecodeError, gzip.BadGzipFile) as e:
        print(f"Error reading {file_path}: {e}")
        return None

# Create a new structure to store all battery data (keyed by input file name)
data_all_cells = {}

# Folder holding the per-cell '*.json.gz' files. Built with os.path.join so the
# relative path resolves on every platform, not only where '\' is the separator.
folder_loc = os.path.join('..', '001_EoL_Data')
# os.listdir returns entries in arbitrary order; sort them so the processing
# order (and the key order of the output) is reproducible between runs.
file_list = sorted(f for f in os.listdir(folder_loc) if f.endswith('.json.gz'))
# Capacity used to normalise throughput: EFC = throughput / (2 * nominal_capacity)
nominal_capacity = 4.84
crashed_cell = []  # Names of files that could not be read
no_raw_data = []  # Names of files that have no 'raw_data' entry
# Process each battery file one by one
start_time = time.time()
# Labels assigned, in order, to the six consecutive cycles that begin at each
# diagnostic start cycle: label number j is applied to cycle ``start + j``.
rpt_labels = ['reset', 'hppc', '02C', '1C', '2C', 'work']
# Channels stored for every label; this order is also the key order in the output.
rpt_fields = ['test_time', 'voltage', 'current', 'charge_capacity', 'discharge_capacity', 'EFC']

for file_name in file_list:
    # Time every file on its own, so the figure printed below is the time spent
    # on this file rather than the running total since the script started.
    file_start_time = time.time()

    # Read JSON file contents
    file_path = os.path.join(folder_loc, file_name)
    data = read_gzipped_json(file_path)

    if data is None:
        print(f"Skipping {file_name} due to read error.")
        crashed_cell.append(file_name)
        continue

    cell_data = {}

    # Step 1: Retrieve the protocol; if it is missing, default to 'Unknown'
    cell_data['protocol'] = data.get('protocol', 'Unknown')

    # Step 2: Retrieve diagnostic_starts_at from structuring_parameters.
    # A null/false entry at any level is treated like a missing one, so such a
    # file simply ends up with an empty 'rpt' list.
    structuring_parameters = data.get('structuring_parameters') or {}
    diagnostic_available = structuring_parameters.get('diagnostic_available') or {}
    diagnostic_starts_at = diagnostic_available.get('diagnostic_starts_at') or []

    # Retrieve relevant data from raw_data (a missing or null entry skips the file)
    raw_data = data.get('raw_data')
    if raw_data is None:
        print(f"Skipping {file_name} due to non raw_data.")
        no_raw_data.append(file_name)
        continue

    # NOTE: every column is indexed with positions taken from cycle_index, so
    # all columns are assumed to be at least as long as cycle_index.
    cycle_index = raw_data.get('cycle_index', [])
    test_time = raw_data.get('test_time', [])
    voltage = raw_data.get('voltage', [])
    current = raw_data.get('current', [])
    charge_capacity = raw_data.get('charge_capacity', [])
    discharge_capacity = raw_data.get('discharge_capacity', [])

    # Step 3: Calculate EFC for each data point and, in the same pass, record
    # which row positions belong to each cycle so that Step 4 can look a cycle
    # up directly instead of rescanning the whole column for every label.
    EFC_list = []
    indices_by_cycle = {}
    last_cycle_throughput = 0  # Throughput accumulated up to the end of the previous cycle
    last_idx = len(cycle_index) - 1

    for idx, cycle in enumerate(cycle_index):
        indices_by_cycle.setdefault(cycle, []).append(idx)

        # Throughput of the current cycle so far = charge_capacity + discharge_capacity
        current_cycle_throughput = charge_capacity[idx] + discharge_capacity[idx]

        # Add the throughput carried over from all previous cycles
        total_capacity_throughput = last_cycle_throughput + current_cycle_throughput

        EFC_list.append(total_capacity_throughput / (2 * nominal_capacity))

        # On the last data point of a cycle, carry the total over to the next cycle
        if idx == last_idx or cycle_index[idx + 1] != cycle:
            last_cycle_throughput = total_capacity_throughput

    # Step 4: For every diagnostic start, collect the labelled cycles and shift
    # test_time so that each extracted cycle starts from 0
    cell_data['rpt'] = []
    for start in diagnostic_starts_at:
        rpt_data = {label: {field: [] for field in rpt_fields} for label in rpt_labels}

        for offset, label in enumerate(rpt_labels):
            rows = indices_by_cycle.get(start + offset)
            if not rows:
                # No data recorded for this cycle: leave its lists empty
                continue

            first_test_time = test_time[rows[0]]  # Baseline for the adjusted test_time
            segment = rpt_data[label]
            segment['test_time'] = [test_time[idx] - first_test_time for idx in rows]
            segment['voltage'] = [voltage[idx] for idx in rows]
            segment['current'] = [current[idx] for idx in rows]
            segment['charge_capacity'] = [charge_capacity[idx] for idx in rows]
            segment['discharge_capacity'] = [discharge_capacity[idx] for idx in rows]
            segment['EFC'] = [EFC_list[idx] for idx in rows]

        cell_data['rpt'].append(rpt_data)

    # Finally, store the data for this battery into data_all_cells
    data_all_cells[file_name] = cell_data

    # Calculate and print the time taken to process this file
    elapsed_time = time.time() - file_start_time
    print(f"Processing file: {file_name}", f"Time taken: {elapsed_time:.2f} seconds")

    # Drop the large per-file objects before the next file is loaded. Only names
    # that are bound in every iteration reaching this point are deleted
    # (rpt_data is not bound when a file has no diagnostic starts).
    del data, raw_data, cycle_index, test_time, voltage, current
    del charge_capacity, discharge_capacity, EFC_list, indices_by_cycle

# Write the collected per-cell results out as gzip-compressed JSON text.
output_file = 'data_all_cells.json.gz'
with gzip.open(output_file, mode='wt', encoding='utf-8') as gz_out:
    json.dump(data_all_cells, fp=gz_out)