import pandas as pd
import os

# Anchor every file to the directory that contains this script, so the paths
# do not depend on the current working directory.
base_path = os.path.dirname(os.path.abspath(__file__))

# Input files read by the script
productinfo_path, apparel_xlsx_path = (
    os.path.join(base_path, file_name)
    for file_name in ('productinfo.csv', 'apparel_data_with_colors_and_sizes.xlsx')
)

# Output files written by the script
simple_products_path, parent_products_path = (
    os.path.join(base_path, file_name)
    for file_name in ('simple_products.csv', 'parent_products.xlsx')
)

# List of encodings to try, in order. 'latin1' must stay last: it maps every
# possible byte to a character and therefore never fails to decode, so any
# encoding listed after it would never be attempted. 'windows-1252' is tried
# before it for that reason. 'iso-8859-1' is an alias of 'latin1', so it is
# not listed separately.
encodings = ['utf-8', 'windows-1252', 'latin1']

# Function to read the CSV file with different encodings
def read_csv_with_encoding(file_path, candidate_encodings=None):
    """Read a CSV file, trying several text encodings in order.

    Args:
        file_path: Path of the CSV file to load.
        candidate_encodings: Optional iterable of encoding names to try, in
            order. When omitted, the module-level ``encodings`` list is used.

    Returns:
        The DataFrame produced by the first encoding that reads the file
        without a decoding error.

    Raises:
        ValueError: If there are no encodings to try.
        UnicodeDecodeError: If every encoding failed. The error from the last
            attempt is chained as the cause.
    """
    if candidate_encodings is None:
        candidate_encodings = encodings

    last_error = None
    for encoding in candidate_encodings:
        try:
            return pd.read_csv(file_path, encoding=encoding)
        except UnicodeDecodeError as error:
            # Remember the failure and fall through to the next encoding.
            last_error = error

    if last_error is None:
        raise ValueError("No encodings were specified for reading the file.")

    # UnicodeDecodeError takes five constructor arguments (encoding, object,
    # start, end, reason); calling it with only a message raises TypeError.
    # Reuse the details of the last real failure and replace the reason.
    raise UnicodeDecodeError(
        last_error.encoding,
        last_error.object,
        last_error.start,
        last_error.end,
        "Unable to decode the file with any of the specified encodings.",
    ) from last_error

# Load both inputs: the product list from the CSV file (its encoding is found
# by trial) and the 'Apparel Sweatshirts Data' sheet of the workbook.
productinfo_df = read_csv_with_encoding(productinfo_path)
apparel_df = pd.read_excel(apparel_xlsx_path, sheet_name='Apparel Sweatshirts Data')

# Every PRODUCT value present in the apparel sheet is excluded from the
# product list; the rows left over are written to simple_products.csv.
excluded_product_ids = apparel_df['PRODUCT']
is_excluded = productinfo_df['PRODUCT'].isin(excluded_product_ids)
simple_products_df = productinfo_df.loc[~is_excluded]
simple_products_df.to_csv(simple_products_path, index=False)

# Build the parent-product table: one row per Parent_ID, carrying the PRODUCT
# and DESCRIPTION of the first apparel row in which that Parent_ID appears.
# No column rename is needed; the columns already carry their final names.
parent_products_df = (
    apparel_df[['PRODUCT', 'DESCRIPTION', 'Parent_ID']]
    .drop_duplicates(subset=['Parent_ID'], keep='first')
)

# Save the parent_products DataFrame to an Excel file
parent_products_df.to_excel(parent_products_path, sheet_name='Parent Products', index=False)

# Report where each output file was written, one message per line.
print(
    f"Filtered products saved in {simple_products_path}.",
    f"Parent products saved in {parent_products_path}.",
    sep="\n",
)