# Airflow - DAGs

# Imports needed by the DAG: Airflow primitives, HTTP/JSON handling, pandas
# and SQLAlchemy.
# NOTE(review): a failing import is only printed here, not re-raised, so a
# missing dependency will surface later as a NameError at the first use of the
# missing name rather than as an ImportError at this point.
try:
    from datetime import timedelta
    from datetime import datetime
    from airflow import DAG
    from airflow.operators.python_operator import PythonOperator

    import pandas as pd
    import json
    import requests
    from sqlalchemy import create_engine
except Exception as e:
    print("Error {} ".format(e))


# Base URL of the dRoW API; the authentication and document-export endpoints
# are built from it.
dRoW_api_end_url = "https://drow.cloud"

# Workflow exported by getPaymentStatistics (main payment records).
WORKFLOW_ID = "68e2a62c0ad70edbe7ac8f4b"
# Workflow exported by getDepositSummary and getDisallowedCostSummary.
OCR_WORKFLOW_ID = "698d8f4bc779a898e40bc4e2"
# Workflow whose first record carries the "Final Stats" table.
FINAL_STATS_WORKFLOW_ID = "69e1e55f2fe265d9f1a4ac5f"

# Destination tables; each one is replaced on every run.
TARGET_TABLE = "ssh505_payment_statistics"
DEPOSIT_SUMMARY_TABLE = "ssh505_deposit_summary"
DISALLOWED_COST_TABLE = "ssh505_disallowed_cost_summary"

# Prefix of the web URL of a single OCR record; build_ocr_doc_url appends the
# record id and a query string to it.
OCR_DOC_BASE_URL = (
    "https://drow.cloud/workflow/"
    "ssh505-payment-digitalization/"
    "nec-payment/"
    "3-scc235-ocr-upload-ip10-onwards/"
    f"{OCR_WORKFLOW_ID}/record/"
)


# =========================================================
# FIELD IDS
# The trailing comment on each ID is the English name of the field; the exact
# exported label for each ID is given in FIELD_ID_TO_DATA_KEY and
# SUBFIELD_ID_TO_DATA_KEY.
# =========================================================

# OCR top-level fields
FIELD_ID_PROJECT_SUMMARY = "68b67714645ce2e48136f7cd"      # Project summary
FIELD_ID_CONTRACTOR_SUPPLIER = "68b676b4645ce2e481360c71"  # Contractor / supplier
FIELD_ID_REFERENCE_NO = "66ffa3201da9bff0231357b4"         # Reference No
FIELD_ID_IP_NO = "6539ee7821a498138fe52b1a"                # IP No.

# Optional fields still used by deposit summary
FIELD_ID_GENERAL_WORKS_CODE = "68b6776d645ce2e481378468"   # General head / works number
FIELD_ID_PROJECT_CODE = "6957759828ad1356b023a972"         # Project number
FIELD_ID_COMPANY_CODE = "6957759828ad1356b023a971"         # Company number
FIELD_ID_APPLICATION_MONTH = "6539f05021a498138fe52b22"    # Application Month

# Table field
FIELD_ID_CURRENT_PAYMENT_TABLE = "66c58ab093540be489f5215e"  # Current Payment application & assessment

# Table sub-field IDs (columns of the table field above)
SUBFIELD_ID_IP_CURRENT = "686cecebfb15912813259bc7"            # IP (Current)
SUBFIELD_ID_SCC_NO_MC_APPLIED = "66c58ab093540be489f5215f"      # SCC No. (MC Applied)
SUBFIELD_ID_FORECAST_MC = "66c58ab093540be489f52160"            # Forecast (MC)
SUBFIELD_ID_PAID_MC = "686cee1b0a4da4e38e87893c"                # Paid (MC)
SUBFIELD_ID_FORECAST_PM = "674d402654ad15ddfb738677"            # Forecast (PM)
SUBFIELD_ID_PAID_PM = "686ceeb8fb159128132a9203"                # Paid (PM)
SUBFIELD_ID_DISALLOWED_COST = "6985ba002ad01e92c164765b"        # Disallowed Cost
SUBFIELD_ID_DEPOSIT_AMOUNT = "69afd166978f0b7a28820909"         # Deposit amount
SUBFIELD_ID_CMDS_COMMENTS = "6985b63bb5672026ce347de4"          # CMD's Comments


# =========================================================
# FIELD ID -> EXPORTED DATA KEY
# export_type=0 returns populated data in record["data"]
# The lookup helpers below translate a field ID into the label used as the
# dictionary key in that exported data.
# =========================================================
# Top-level fields of record["data"].
FIELD_ID_TO_DATA_KEY = {
    FIELD_ID_PROJECT_SUMMARY: "項目摘要",
    FIELD_ID_CONTRACTOR_SUPPLIER: "承判商/供應商",
    FIELD_ID_REFERENCE_NO: "Reference No",
    FIELD_ID_IP_NO: "IP No.",
    FIELD_ID_GENERAL_WORKS_CODE: "總目/工程編號",
    FIELD_ID_PROJECT_CODE: "項目編號",
    FIELD_ID_COMPANY_CODE: "公司編號",
    FIELD_ID_APPLICATION_MONTH: "Application Month",
    FIELD_ID_CURRENT_PAYMENT_TABLE: "Current Payment application & assessment",
}

# Columns of a row of the "Current Payment application & assessment" table.
SUBFIELD_ID_TO_DATA_KEY = {
    SUBFIELD_ID_IP_CURRENT: "IP (Current)",
    SUBFIELD_ID_SCC_NO_MC_APPLIED: "SCC No. (MC Applied)",
    SUBFIELD_ID_FORECAST_MC: "Forecast (MC)",
    SUBFIELD_ID_PAID_MC: "Paid (MC)",
    SUBFIELD_ID_FORECAST_PM: "Forecast (PM)",
    SUBFIELD_ID_PAID_PM: "Paid (PM)",
    SUBFIELD_ID_DISALLOWED_COST: "Disallowed Cost",
    SUBFIELD_ID_DEPOSIT_AMOUNT: "Deposit amount",
    SUBFIELD_ID_CMDS_COMMENTS: "CMD's Comments",
}

# Option ID -> label for SCC No. (MC Applied)
# Checked first by resolve_scc_no_mc_applied.
SCC_NO_MC_APPLIED_OPTION_ID_TO_LABEL = {
    "69685ac426c57b862d1af2b9": "1(People)",
    "692d3be5d331571311361ceb": "2(Equipment)",
    "692d3be5d331571311361cec": "3(Plant and Materials)",
    "692d3be5d331571311361ced": "4(Subcontractors)",
    "692d3be5d331571311361cee": "5(Charges)",
    "692d3be5d331571311361cef": "8A(Insurances)",
}

# Fallback when export already returns raw value instead of option ObjectId
# Checked second; a value found in neither table is returned unchanged.
SCC_NO_MC_APPLIED_VALUE_TO_LABEL = {
    "1": "1(People)",
    "2": "2(Equipment)",
    "3": "3(Plant and Materials)",
    "4": "4(Subcontractors)",
    "5": "5(Charges)",
    "8A": "8A(Insurances)",
}


def build_ocr_doc_url(record_id):
    """Return the web URL of an OCR workflow record.

    Args:
        record_id: Identifier of the record (callers pass the record's
            ``_id``).

    Returns:
        ``OCR_DOC_BASE_URL`` followed by the record id and the fixed query
        string ``?numPerPage=100``, or ``None`` when ``record_id`` is falsy.
    """
    return f"{OCR_DOC_BASE_URL}{record_id}?numPerPage=100" if record_id else None


def normalize_ip_no(value):
    """Return ``value`` as a whitespace-trimmed string.

    ``None`` is passed through unchanged so that a missing IP number stays
    distinguishable from an empty one.
    """
    return None if value is None else str(value).strip()


def safe_to_number(value):
    """Convert a loosely formatted value to ``int`` or ``float`` when possible.

    Args:
        value: Any value. Only strings are parsed; everything else
            (including ``None``, ints and floats) is returned unchanged.

    Returns:
        * ``None`` for ``None`` and for blank strings.
        * A ``float`` for a numeric string containing ``"."``, otherwise an
          ``int``; surrounding whitespace and ``,`` separators are ignored.
        * The original ``value`` when the string cannot be parsed.
    """
    if not isinstance(value, str):
        # None, numbers and any other non-string object pass through as-is.
        return value

    cleaned = value.strip().replace(",", "")
    if not cleaned:
        return None

    try:
        return float(cleaned) if "." in cleaned else int(cleaned)
    except ValueError:
        # int()/float() signal an unparseable string with ValueError only;
        # keep the caller's original text rather than hiding other errors.
        return value


def getDrowToken(**context):
    """Authenticate against the dRoW API and publish the token via XCom.

    Posts the credentials below to ``/api/auth/authenticate`` and pushes the
    ``token`` field of the JSON response to XCom under the key ``"token"``,
    where the downstream tasks pull it.

    Args:
        **context: Airflow task context; ``context["ti"]`` is used to push
            the XCom value.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer in time.
        KeyError: If the response JSON has no ``token`` field.
    """
    # NOTE(security): these credentials are hard-coded in the DAG source.
    # TODO: move them to an Airflow Connection/Variable or a secrets backend.
    response = requests.post(
        url=f"{dRoW_api_end_url}/api/auth/authenticate",
        data={
            "username": "dylanlam@drow.cloud",
            "password": "dGVzdDAxQHRlc3QuY29t"
        },
        # requests waits forever by default; bound the wait so a hung API
        # fails the task (and triggers the retry) instead of blocking it.
        timeout=30,
    )
    # Surface an authentication failure as an HTTP error instead of a
    # KeyError on the missing "token" field.
    response.raise_for_status()

    context["ti"].xcom_push(key="token", value=response.json()["token"])


def get_postgres_connection():
    """Create a SQLAlchemy engine for the PostgreSQL database configured below.

    The user name and password are URL-escaped before being embedded in the
    connection URL, so characters such as ``@``, ``:`` or ``/`` in a future
    password do not corrupt the URL. For the current values the resulting URL
    is unchanged.

    Returns:
        The engine returned by ``create_engine``. The caller owns it.
    """
    # Imported locally so the function does not rely on a file-level import.
    from urllib.parse import quote_plus

    # NOTE(security): these credentials are hard-coded in the DAG source.
    # TODO: move them to an Airflow Connection or a secrets backend.
    host = "drowdatewarehouse.crlwwhgepgi7.ap-east-1.rds.amazonaws.com"
    dbUserName = "dRowAdmin"
    dbUserPassword = "drowsuper"
    database = "drowDateWareHouse"
    port = "5432"

    conn_string = (
        f"postgresql://{quote_plus(dbUserName)}:{quote_plus(dbUserPassword)}"
        f"@{host}:{port}/{database}"
    )

    return create_engine(conn_string)


def get_workflow_records(token, workflow_id):
    """Fetch the exported records of a dRoW workflow.

    Args:
        token: Access token obtained from the authentication endpoint; sent
            as ``Bearer <token>`` in the ``x-access-token`` header.
        workflow_id: Identifier of the workflow to export.

    Returns:
        The decoded JSON body of the export response. Callers in this file
        treat it as a list of record dicts.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer in time.
    """
    response = requests.get(
        url=f"{dRoW_api_end_url}/api/module/document-export/airflow/workflow/{workflow_id}?export_type=0",
        headers={
            "x-access-token": f"Bearer {token}",
        },
        # (connect, read) timeouts: requests waits forever by default. The
        # read timeout is generous because an export may take a while.
        timeout=(10, 300),
    )
    # Without this, an error payload would be returned as if it were the
    # record list and break the callers in confusing ways.
    response.raise_for_status()
    return response.json()


def get_data_value_by_field_id(data, field_id):
    """Look up a top-level field of an exported record by its field ID.

    Args:
        data: The record's ``data`` mapping, keyed by field label.
        field_id: One of the ``FIELD_ID_*`` constants.

    Returns:
        The value stored under the label mapped to ``field_id`` in
        ``FIELD_ID_TO_DATA_KEY``; ``None`` when the ID has no mapping or the
        label is absent from ``data``.
    """
    label = FIELD_ID_TO_DATA_KEY.get(field_id)
    return data.get(label) if label else None


def get_table_first_row_by_field_id(data, table_field_id):
    """Return the first row of a table field of an exported record.

    Args:
        data: The record's ``data`` mapping, keyed by field label.
        table_field_id: Field ID of the table, resolved to its label through
            ``FIELD_ID_TO_DATA_KEY``.

    Returns:
        The first element of the table when the table is a non-empty list
        whose first element is a dict; an empty dict in every other case
        (unknown ID, missing table, empty table, unexpected shape).
    """
    label = FIELD_ID_TO_DATA_KEY.get(table_field_id)
    if label:
        rows = data.get(label, [])
        if isinstance(rows, list) and rows and isinstance(rows[0], dict):
            return rows[0]
    return {}


def get_subfield_value_by_field_id(row, subfield_id):
    """Look up a column of a table row by its sub-field ID.

    Args:
        row: A table row, keyed by column label.
        subfield_id: One of the ``SUBFIELD_ID_*`` constants.

    Returns:
        The value stored under the label mapped to ``subfield_id`` in
        ``SUBFIELD_ID_TO_DATA_KEY``; ``None`` when the ID has no mapping or
        the label is absent from ``row``.
    """
    label = SUBFIELD_ID_TO_DATA_KEY.get(subfield_id)
    return row.get(label) if label else None


def build_joined_code(*parts):
    """Join the non-empty parts into a single ``-`` separated code.

    Each part is converted to a string and stripped; ``None`` parts and parts
    that are blank after stripping are dropped.

    Returns:
        The joined code, or ``None`` when no part survives.
    """
    stripped = (str(part).strip() for part in parts if part is not None)
    kept = [text for text in stripped if text]
    if not kept:
        return None
    return "-".join(kept)


def safe_add(a, b):
    """Add two amounts, treating missing values as zero.

    ``None`` and blank strings count as ``0``. Numeric strings (optionally
    with ``,`` separators and surrounding whitespace) are parsed before the
    addition, so two text amounts are summed rather than concatenated.
    Numbers are used as-is.

    Args:
        a: First addend (number, numeric string, blank string or ``None``).
        b: Second addend, same accepted forms.

    Returns:
        The sum of the two normalized addends.

    Raises:
        TypeError: If an addend is a non-numeric string (or another type that
            cannot be added) combined with a number; bad data is surfaced
            rather than silently dropped.
    """

    def _as_addend(value):
        # Normalize one operand; unparseable text is returned untouched so
        # the addition below fails loudly on it.
        if value is None:
            return 0
        if isinstance(value, str):
            cleaned = value.strip().replace(",", "")
            if not cleaned:
                return 0
            try:
                return float(cleaned) if "." in cleaned else int(cleaned)
            except ValueError:
                return value
        return value

    return _as_addend(a) + _as_addend(b)


def resolve_scc_no_mc_applied(raw_value):
    """Translate an exported "SCC No. (MC Applied)" value into its label.

    The value is stringified and stripped, then looked up first as an option
    ID and then as a raw option value.

    Returns:
        ``None`` for ``None``; the matching label when one of the two tables
        knows the value; otherwise the stripped string itself.
    """
    if raw_value is None:
        return None

    candidate = str(raw_value).strip()
    lookup_order = (
        SCC_NO_MC_APPLIED_OPTION_ID_TO_LABEL,
        SCC_NO_MC_APPLIED_VALUE_TO_LABEL,
    )
    for table in lookup_order:
        if candidate in table:
            return table[candidate]
    return candidate


def is_empty_value(value):
    """Tell whether ``value`` is ``None`` or a blank string.

    Only ``None`` and strings that are empty after stripping count as empty;
    other falsy values such as ``0`` or ``[]`` do not.
    """
    return value is None or (isinstance(value, str) and not value.strip())


def extract_output_row(record):
    """Build one payment-statistics row from a main-workflow record.

    Args:
        record: An exported record; its ``data`` mapping is read by label.

    Returns:
        A dict with the normalized ``ip_no``, the exported amounts copied
        as-is, and the two ``final_total_*`` columns preset to ``None``.
    """
    payload = record.get("data", {})

    # (output column, exported label) pairs, in output column order.
    copied_columns = (
        ("gross_total", "Gross Total"),
        ("retention", "Retention"),
        ("previous", "Previous"),
        ("amount_due", "Amount Due"),
        ("scc1_people_amount_cumulative", "SCC1 - People Amount (Cumulative)"),
        ("scc2_equipments_amount_cumulative", "SCC2 - Equipments Amount (Cumulative)"),
        ("scc3_plant_materials_amount_cumulative", "SCC3 - Plant & Materials Amount (Cumulative)"),
        ("scc4_subcontracts_amount_cumulative", "SCC4 - Subcontracts Amount (Cumulative)"),
        ("scc5_charges_amount_cumulative", "SCC5 - Charges Amount (Cumulative)"),
        ("scc8_8a_insurances_amount_cumulative", "SCC8&8A - Insurances Amount (Cumulative)"),
        ("fee_20_of_scc_amount_cumulative", "Fee (20% of SCC) Amount (Cumulative)"),
    )

    output = {"ip_no": normalize_ip_no(payload.get("IP No."))}
    output.update((column, payload.get(label)) for column, label in copied_columns)

    # Placeholders: the final stats are merged in later from
    # FINAL_STATS_WORKFLOW_ID by getPaymentStatistics.
    output["final_total_of_the_price_with_pain"] = None
    output["final_total_of_the_price"] = None
    return output


def build_final_stats_lookup(final_stats_records):
    """Index the "Final Stats" table of the first record by IP number.

    Args:
        final_stats_records: Records exported from the final-stats workflow.
            Only the first record is read.

    Returns:
        A dict mapping each normalized IP number to a dict holding
        ``final_total_of_the_price_with_pain`` and
        ``final_total_of_the_price`` converted with ``safe_to_number``. Rows
        that are not dicts or have no IP number are ignored. The result is
        empty when there are no records or when "Final Stats" is not a list.
    """
    if not final_stats_records:
        return {}

    stats_rows = final_stats_records[0].get("data", {}).get("Final Stats", [])
    if not isinstance(stats_rows, list):
        print("Final Stats field is not a list.")
        return {}

    by_ip_no = {}
    for entry in stats_rows:
        if isinstance(entry, dict):
            ip_key = normalize_ip_no(entry.get("IP no."))
            if ip_key:
                with_pain = entry.get("Final Total of the Price (with Pain)")
                without_pain = entry.get("Final Total of the Price")
                by_ip_no[ip_key] = {
                    "final_total_of_the_price_with_pain": safe_to_number(with_pain),
                    "final_total_of_the_price": safe_to_number(without_pain),
                }
    return by_ip_no


def extract_deposit_summary_row(record):
    """Build one deposit-summary row from an OCR workflow record.

    Top-level fields are read from the record's ``data`` mapping and the
    payment figures from the first row of the current-payment table.

    Args:
        record: An exported OCR record with ``_id`` and ``data`` entries.

    Returns:
        A dict with the record URL, the descriptive fields, the joined
        general-works/project/company code, the resolved SCC label, the MC
        and PM totals (forecast + paid), the deposit amount and the CMD
        comment.
    """
    data = record.get("data", {})
    payment_row = get_table_first_row_by_field_id(data, FIELD_ID_CURRENT_PAYMENT_TABLE)

    def top_level(field_id):
        return get_data_value_by_field_id(data, field_id)

    def from_table(subfield_id):
        return get_subfield_value_by_field_id(payment_row, subfield_id)

    return {
        "doc_url": build_ocr_doc_url(record.get("_id")),
        "project_summary": top_level(FIELD_ID_PROJECT_SUMMARY),
        "contractor_supplier": top_level(FIELD_ID_CONTRACTOR_SUPPLIER),
        "reference_no": top_level(FIELD_ID_REFERENCE_NO),
        "combined_code": build_joined_code(
            top_level(FIELD_ID_GENERAL_WORKS_CODE),
            top_level(FIELD_ID_PROJECT_CODE),
            top_level(FIELD_ID_COMPANY_CODE),
        ),
        "application_month": top_level(FIELD_ID_APPLICATION_MONTH),
        "ip_no": top_level(FIELD_ID_IP_NO),
        "ip_current": from_table(SUBFIELD_ID_IP_CURRENT),
        "scc_no_mc_applied": resolve_scc_no_mc_applied(
            from_table(SUBFIELD_ID_SCC_NO_MC_APPLIED)
        ),
        "total_mc": safe_add(
            from_table(SUBFIELD_ID_FORECAST_MC),
            from_table(SUBFIELD_ID_PAID_MC),
        ),
        "total_pm": safe_add(
            from_table(SUBFIELD_ID_FORECAST_PM),
            from_table(SUBFIELD_ID_PAID_PM),
        ),
        "deposit_amount": from_table(SUBFIELD_ID_DEPOSIT_AMOUNT),
        "cmd_comment": from_table(SUBFIELD_ID_CMDS_COMMENTS),
    }


def extract_disallowed_cost_row(record):
    """Build one disallowed-cost row from an OCR workflow record.

    Top-level fields are read from the record's ``data`` mapping and the
    payment figures from the first row of the current-payment table.

    Args:
        record: An exported OCR record with ``_id`` and ``data`` entries.

    Returns:
        A dict with the record URL, the resolved SCC label, the descriptive
        fields, the MC and PM totals (forecast + paid), the disallowed cost
        and the CMD comment.
    """
    data = record.get("data", {})
    payment_row = get_table_first_row_by_field_id(data, FIELD_ID_CURRENT_PAYMENT_TABLE)

    def top_level(field_id):
        return get_data_value_by_field_id(data, field_id)

    def from_table(subfield_id):
        return get_subfield_value_by_field_id(payment_row, subfield_id)

    return {
        "doc_url": build_ocr_doc_url(record.get("_id")),
        "scc_no_mc_applied": resolve_scc_no_mc_applied(
            from_table(SUBFIELD_ID_SCC_NO_MC_APPLIED)
        ),
        "project_summary": top_level(FIELD_ID_PROJECT_SUMMARY),
        "contractor_supplier": top_level(FIELD_ID_CONTRACTOR_SUPPLIER),
        "reference_no": top_level(FIELD_ID_REFERENCE_NO),
        "ip_no": top_level(FIELD_ID_IP_NO),
        "ip_current": from_table(SUBFIELD_ID_IP_CURRENT),
        "total_mc": safe_add(
            from_table(SUBFIELD_ID_FORECAST_MC),
            from_table(SUBFIELD_ID_PAID_MC),
        ),
        "total_pm": safe_add(
            from_table(SUBFIELD_ID_FORECAST_PM),
            from_table(SUBFIELD_ID_PAID_PM),
        ),
        "disallowed_cost": from_table(SUBFIELD_ID_DISALLOWED_COST),
        "cmd_comment": from_table(SUBFIELD_ID_CMDS_COMMENTS),
    }


def inspectFinalStatsWorkflow(**context):
    """Print a sample of the "Final Stats" table for inspection.

    Pulls the API token from XCom, exports the final-stats workflow and
    prints the first five rows of the "Final Stats" table of the first
    record. Nothing is written to the database.

    Args:
        **context: Airflow task context; ``context["ti"]`` is used to pull
            the token.
    """
    token = context.get("ti").xcom_pull(key="token")

    records = get_workflow_records(token, FINAL_STATS_WORKFLOW_ID)
    print(f"Total records fetched from final stats workflow: {len(records) if records else 0}")

    if not records:
        print("No records found in final stats workflow.")
        return

    first_record = records[0]
    final_stats_rows = first_record.get("data", {}).get("Final Stats", [])

    # Same guard as build_final_stats_lookup: slicing a None or dict value
    # would raise and fail this purely informational task.
    if not isinstance(final_stats_rows, list):
        print("Final Stats field is not a list.")
        return

    print("\n===== FINAL STATS ARRAY (FIRST 5 ROWS) =====")
    print(json.dumps(final_stats_rows[:5], indent=2, default=str))


def getPaymentStatistics(**context):
    """Load the payment statistics table from the main and final-stats workflows.

    Exports the main workflow, builds one row per record with
    ``extract_output_row``, merges in the two final-total columns from the
    final-stats workflow (matched on the normalized IP number) and replaces
    ``TARGET_TABLE`` with the result. Returns without touching the table when
    the main workflow has no records.

    Args:
        **context: Airflow task context; ``context["ti"]`` is used to pull
            the token.
    """
    token = context.get("ti").xcom_pull(key="token")

    main_records = get_workflow_records(token, WORKFLOW_ID)
    print(f"Total records fetched from main workflow: {len(main_records) if main_records else 0}")

    if not main_records:
        print("No records found in main workflow.")
        return

    final_stats_records = get_workflow_records(token, FINAL_STATS_WORKFLOW_ID)
    print(f"Total records fetched from final stats workflow: {len(final_stats_records) if final_stats_records else 0}")

    final_stats_lookup = build_final_stats_lookup(final_stats_records)
    print(f"Final stats lookup rows prepared: {len(final_stats_lookup)}")

    output_rows = []
    matched_final_stats_count = 0
    unmatched_ip_nos = []

    for record in main_records:
        row = extract_output_row(record)
        ip_no = normalize_ip_no(row.get("ip_no"))

        final_stats = final_stats_lookup.get(ip_no)
        if final_stats:
            row["final_total_of_the_price_with_pain"] = final_stats.get("final_total_of_the_price_with_pain")
            row["final_total_of_the_price"] = final_stats.get("final_total_of_the_price")
            matched_final_stats_count += 1
        else:
            # Kept only for the diagnostic output below; the row is still
            # written with its final totals left as None.
            unmatched_ip_nos.append(ip_no)

        output_rows.append(row)

    print("\n===== SAMPLE OUTPUT ROW (PAYMENT STATISTICS) =====")
    print(json.dumps(output_rows[:1], indent=2, default=str))

    print(f"Matched final stats rows: {matched_final_stats_count}")
    print(f"Unmatched IP nos count: {len(unmatched_ip_nos)}")
    if unmatched_ip_nos:
        print("Sample unmatched IP nos:", unmatched_ip_nos[:10])

    df = pd.DataFrame(output_rows)

    engine = get_postgres_connection()
    try:
        df.to_sql(
            TARGET_TABLE,
            engine,
            if_exists="replace",
            index=False
        )
    finally:
        # Release the pooled connections even when the write fails.
        engine.dispose()

    print(f"Inserted {len(df)} rows into {TARGET_TABLE}")


def _write_ocr_summary_table(
    context,
    *,
    extract_row,
    required_column,
    table_name,
    label,
    fetch_suffix="",
):
    """Export the OCR workflow, keep the relevant rows and replace a table.

    Shared pipeline behind getDepositSummary and getDisallowedCostSummary.

    Args:
        context: Airflow task context dict; ``context["ti"]`` is used to pull
            the token.
        extract_row: Callable turning one exported record into a row dict.
        required_column: Row key that must be non-empty (per
            ``is_empty_value``) for the row to be kept.
        table_name: Destination table, replaced with the kept rows.
        label: Lower-case summary name used in the log messages.
        fetch_suffix: Text inserted into the "records fetched" log message.

    Returns:
        None. The table is left untouched when the workflow has no records
        or no row survives the filter.
    """
    token = context.get("ti").xcom_pull(key="token")

    records = get_workflow_records(token, OCR_WORKFLOW_ID)
    print(f"Total OCR records fetched{fetch_suffix}: {len(records) if records else 0}")

    if not records:
        print("No records found in OCR workflow.")
        return

    candidate_rows = [extract_row(record) for record in records]
    output_rows = [
        row for row in candidate_rows
        if not is_empty_value(row.get(required_column))
    ]
    skipped_count = len(candidate_rows) - len(output_rows)

    print(f"Filtered {label} rows: kept {len(output_rows)}, skipped {skipped_count}")

    if not output_rows:
        print(f"No {label} rows generated.")
        return

    print(f"\n===== SAMPLE {label.upper()} ROW =====")
    print(json.dumps(output_rows[0], indent=2, default=str))

    df = pd.DataFrame(output_rows)

    engine = get_postgres_connection()
    try:
        df.to_sql(
            table_name,
            engine,
            if_exists="replace",
            index=False
        )
    finally:
        # Release the pooled connections even when the write fails.
        engine.dispose()

    print(f"Inserted {len(df)} rows into {table_name}")


def getDepositSummary(**context):
    """Replace DEPOSIT_SUMMARY_TABLE with OCR rows that have a deposit amount.

    Args:
        **context: Airflow task context; ``context["ti"]`` is used to pull
            the token.
    """
    _write_ocr_summary_table(
        context,
        extract_row=extract_deposit_summary_row,
        required_column="deposit_amount",
        table_name=DEPOSIT_SUMMARY_TABLE,
        label="deposit summary",
    )


def getDisallowedCostSummary(**context):
    """Replace DISALLOWED_COST_TABLE with OCR rows that have a disallowed cost.

    Args:
        **context: Airflow task context; ``context["ti"]`` is used to pull
            the token.
    """
    _write_ocr_summary_table(
        context,
        extract_row=extract_disallowed_cost_row,
        required_column="disallowed_cost",
        table_name=DISALLOWED_COST_TABLE,
        label="disallowed cost",
        fetch_suffix=" for disallowed cost",
    )


# DAG definition: one authentication task followed by four independent tasks
# that each pull the token from XCom.
with DAG(
    dag_id="ssh505-scc",
    # Cron: minute 0 of hours 7 and 15, every day.
    schedule_interval="0 7,15 * * *",
    default_args={
        "owner": "airflow",
        # Each task is retried once, five minutes after a failure.
        "retries": 1,
        "retry_delay": timedelta(minutes=5),
        "start_date": datetime(2023, 1, 17),
    },
    # Do not back-fill the runs between start_date and now.
    catchup=False,
) as f:

    # NOTE(review): provide_context=True is presumably what makes the
    # callables receive **context (Airflow 1.x style) — confirm against the
    # deployed Airflow version before removing it.
    getDrowTokenTask = PythonOperator(
        task_id="getDrowToken",
        python_callable=getDrowToken,
        provide_context=True,
    )

    getPaymentStatisticsTask = PythonOperator(
        task_id="getPaymentStatistics",
        python_callable=getPaymentStatistics,
        provide_context=True,
    )

    getDepositSummaryTask = PythonOperator(
        task_id="getDepositSummary",
        python_callable=getDepositSummary,
        provide_context=True,
    )

    getDisallowedCostSummaryTask = PythonOperator(
        task_id="getDisallowedCostSummary",
        python_callable=getDisallowedCostSummary,
        provide_context=True,
    )

    inspectFinalStatsWorkflowTask = PythonOperator(
        task_id="inspectFinalStatsWorkflow",
        python_callable=inspectFinalStatsWorkflow,
        provide_context=True,
    )

    # The token task runs first; the four tasks below are its downstream
    # tasks and have no dependencies among themselves.
    getDrowTokenTask >> [
        getPaymentStatisticsTask,
        getDepositSummaryTask,
        getDisallowedCostSummaryTask,
        inspectFinalStatsWorkflowTask
    ]
# DAG: ssh505-scc ROOT: getPaymentStatistics
#
# schedule: 0 7,15 * * *
#
# ssh505-scc