Airflow - DAGs

try:

    from datetime import datetime, timezone, timedelta
    from airflow import DAG
    
    from airflow.operators.python_operator import PythonOperator
    from airflow.operators.http_operator import SimpleHttpOperator
    from datetime import datetime
    from pandas.io.json import json_normalize
    from airflow.operators.postgres_operator import PostgresOperator

    import pandas as pd
    import json
    import requests
    import numpy as np

    import psycopg2
    from sqlalchemy import create_engine, text

except Exception as e:
    print("Error {} ".format(e))


# Base URL of the dRoW API; endpoint paths are appended to it.
dRoW_api_end_url = "https://drow.cloud"


def getDrowToken(**context):
    """Authenticate against the dRoW API and publish the token through XCom.

    Posts the service-account credentials to ``/api/auth/authenticate`` and
    pushes the ``token`` field of the JSON reply to XCom under the key
    ``"token"``.

    Args:
        **context: Airflow task context; ``context["ti"]`` is used to push
            the XCom value.

    Raises:
        requests.HTTPError: If the endpoint answers with a 4xx/5xx status.
        requests.Timeout: If the endpoint does not answer in time.
        KeyError: If the JSON reply has no ``token`` field.
    """
    # NOTE(review): credentials are hard-coded in source; consider moving them
    # to an Airflow Connection/Variable or a secrets backend.
    response = requests.post(
        url=f"{dRoW_api_end_url}/api/auth/authenticate",
        data={
            "username": "icwp2@drow.cloud",
            "password": "dGVzdDAxQHRlc3QuY29t"
        },
        # Without a timeout, requests waits forever and a stalled endpoint
        # would block the worker slot indefinitely.
        timeout=30,
    )
    # Surface authentication/server failures as an HTTP error instead of a
    # confusing JSON-decoding error or KeyError further down.
    response.raise_for_status()
    payload = response.json()
    context["ti"].xcom_push(key="token", value=payload['token'])

def getdrowPSQLConnectionString():
    """Build the SQLAlchemy URL for the dRoW data-warehouse PostgreSQL database.

    Returns:
        str: A URL of the form ``postgresql://user:password@host:port/database``.
    """
    # NOTE(review): host and credentials are hard-coded in source; consider
    # moving them to an Airflow Connection or a secrets backend.
    host = 'drowdatewarehouse.crlwwhgepgi7.ap-east-1.rds.amazonaws.com'
    port = "5432"

    # Name of the database
    database = 'drowDateWareHouse'

    # User name / password of the database user
    dbUserName = 'dRowAdmin'
    dbUserPassword = 'drowsuper'

    # The dialect name must be "postgresql": SQLAlchemy 1.4 dropped the legacy
    # "postgres" alias, while "postgresql" is accepted by every version.
    return f"postgresql://{dbUserName}:{dbUserPassword}@{host}:{port}/{database}"

# Key_Date values that take part in the "latest revised" computation.
_ALLOWED_KEY_DATES = [
    'Section 1', 'Section 2', 'Section 3A', 'Section 3B', 'Section 4', 'Section 5',
    'Section 6', 'Section 7', 'Section 8', 'Section 9A', 'Section 9B', 'Section 9C',
    'Key Date No. 1', 'Key Date No. 2', 'Key Date No. 3A', 'Key Date No. 3B', 'Key Date No. 4',
]

# Single-row summary of CE / CNCE counts, read from public.c5_nec_report_data
# and public.c5_nec_cas.
# NOTE(review): cohort 'A_QuotationToBe' is reported as cnce_rejected and
# 'B_Other' as cnce_outstanding -- confirm this mapping with the report owner.
_CE_CNCE_COUNTS_SQL = '''
WITH cohorts_dedup AS (
  SELECT "NEC_Event_No",
         CASE
           WHEN BOOL_OR("CE_Status" LIKE 'Quotation to be%') THEN 'A_QuotationToBe'
           ELSE 'B_Other'
         END AS cohort
  FROM public.c5_nec_report_data
  WHERE "NEC_Event_No" LIKE 'CN-%'
    AND "CE_Status" <> ''
  GROUP BY "NEC_Event_No"
),
paired AS (
  SELECT
    c."NEC_Event_No",
    c.cohort,
    MAX(CASE WHEN r."NEC_Doc_Type" LIKE 'PMIQ-%' THEN r."Receive_Date" END) AS pmiq_ts,
    MAX(CASE WHEN r."NEC_Doc_Type" LIKE 'PMN-%'  THEN r."Receive_Date" END) AS pmn_ts
  FROM cohorts_dedup c
  JOIN public.c5_nec_report_data r
    ON r."NEC_Event_No" = c."NEC_Event_No"
   AND r."NEC_Doc_Type" SIMILAR TO '(PMIQ-|PMN-)%'
  GROUP BY c."NEC_Event_No", c.cohort
),
diffs AS (
  SELECT
    cohort,
    "NEC_Event_No",
    EXTRACT(EPOCH FROM (pmiq_ts - pmn_ts)) / 86400.0 AS response_days
  FROM paired
  WHERE pmiq_ts IS NOT NULL
    AND pmn_ts  IS NOT NULL
),
cnce_counts AS (
  SELECT
    COUNT(DISTINCT "NEC_Event_No") FILTER (WHERE "CE_Status" <> '') AS cnce_submitted_by_the_contractor,
    COUNT(DISTINCT "NEC_Event_No") FILTER (WHERE "CE_Status" LIKE 'CE%') AS cnce_accepted_up_to_now
  FROM public.c5_nec_report_data
  WHERE "NEC_Event_No" LIKE 'CN-%'
),
diffs_counts AS (
  SELECT
    COUNT(*) FILTER (WHERE cohort = 'A_QuotationToBe') AS cnce_rejected,
    COUNT(*) FILTER (WHERE cohort = 'B_Other')          AS cnce_outstanding
  FROM diffs
),
ce_counts AS (
  SELECT
    COUNT(DISTINCT "NEC_Event_No") FILTER (WHERE COALESCE("CE_Status", '') <> '') AS ce_all,
    COUNT(DISTINCT CASE WHEN "NEC_Doc_Type" LIKE 'QA%' THEN "Original_Doc_No" END) AS ce_implemented,
    COUNT(DISTINCT CASE WHEN "NEC_Event_No" LIKE 'PM%' AND COALESCE("CE_Status", '') <> '' AND "CE_Status" LIKE 'Quotation to be submitted%' THEN "NEC_Event_No" END) AS ce_pending_for_contractor_quotation,
    COUNT(DISTINCT CASE WHEN "NEC_Event_No" LIKE 'PM%' AND COALESCE("CE_Status", '') <> '' AND "CE_Status" LIKE 'Quotation to be assessed%' THEN "NEC_Event_No" END) AS ce_under_review_by_project_manager,
    COUNT(DISTINCT CASE WHEN "NEC_Event_No" LIKE 'PM%' AND COALESCE("CE_Status", '') <> '' AND ("CE_Status" LIKE 'Quotation to be submitted%' OR "CE_Status" LIKE 'Quotation to be assessed%') THEN "NEC_Event_No" END) AS ce_not_yet_implemented
  FROM public.c5_nec_cas
)
SELECT
  cnce_counts.cnce_submitted_by_the_contractor,
  cnce_counts.cnce_accepted_up_to_now,
  diffs_counts.cnce_rejected,
  diffs_counts.cnce_outstanding,
  ce_counts.ce_all,
  ce_counts.ce_implemented,
  ce_counts.ce_pending_for_contractor_quotation,
  ce_counts.ce_under_review_by_project_manager,
  ce_counts.ce_not_yet_implemented
FROM cnce_counts CROSS JOIN diffs_counts CROSS JOIN ce_counts;
'''

# One row per metric (implemented, due to inclement weather, pending quotation,
# under review, CNCE accepted) plus one row per age bucket for the
# "not yet implemented" metric, whose ROLLUP also yields a 'TOTAL' row.
# NOTE(review): paired_cnce computes (pmn_date - nce_date) but the result is
# exposed as avg_days_nce_minus_pmn -- confirm the intended sign.
_CE_MERGED_METRICS_SQL = '''
WITH base AS (
  SELECT *
  FROM public.c5_nec_cas
),
per_event AS (
  SELECT
    "Original_Doc_No",
    MIN(CASE WHEN "NEC_Doc_Type" LIKE 'PMN%' THEN "Receive_Date" END) AS pmn_date,
    MIN(CASE WHEN "NEC_Doc_Type" LIKE 'QA%'  THEN "Receive_Date" END) AS qa_date
  FROM base
  GROUP BY "Original_Doc_No"
),
ce_impl AS (
  SELECT
    COUNT(*) AS total_events,
    ((AVG(qa_date - pmn_date))::text) AS avg_response_interval_text,
    ROUND(AVG(EXTRACT(EPOCH FROM (qa_date - pmn_date)) / 86400.0)::numeric, 2) AS avg_response_days
  FROM per_event
  WHERE qa_date IS NOT NULL AND pmn_date IS NOT NULL
),
eligible_inclement AS (
  SELECT DISTINCT "Original_Doc_No"
  FROM base
  WHERE "NEC_Clause" = '60.1(13)'
),
ce_inc AS (
  SELECT
    COUNT(*) AS total_events,
    ROUND(AVG(EXTRACT(EPOCH FROM (qa_date - pmn_date)) / 86400.0)::numeric, 2) AS avg_response_days
  FROM per_event
  JOIN eligible_inclement USING ("Original_Doc_No")
  WHERE qa_date IS NOT NULL AND pmn_date IS NOT NULL
),
base_pending AS (
  SELECT "NEC_Event_No", "Receive_Date"
  FROM base
  WHERE COALESCE("CE_Status",'') <> ''
    AND "CE_Status" LIKE 'Quotation to be submitted%'
    AND "Receive_Date" IS NOT NULL
),
per_event_pending AS (
  SELECT "NEC_Event_No", MIN("Receive_Date") AS first_receive_date
  FROM base_pending
  GROUP BY "NEC_Event_No"
),
ce_pending AS (
  SELECT
    COUNT(*) AS total_events,
    ROUND(AVG(EXTRACT(EPOCH FROM (NOW() - first_receive_date)) / 86400.0)::numeric, 2) AS avg_days_since_receive
  FROM per_event_pending
),
base_review AS (
  SELECT "NEC_Event_No", "Receive_Date"
  FROM base
  WHERE COALESCE("CE_Status",'') <> ''
    AND "CE_Status" LIKE 'Quotation to be assessed%'
    AND "Receive_Date" IS NOT NULL
),
per_event_review AS (
  SELECT "NEC_Event_No", MIN("Receive_Date") AS first_receive_date
  FROM base_review
  GROUP BY "NEC_Event_No"
),
ce_review AS (
  SELECT
    COUNT(*) AS total_events,
    ROUND(AVG(EXTRACT(EPOCH FROM (NOW() - first_receive_date)) / 86400.0)::numeric, 2) AS avg_days_since_receive
  FROM per_event_review
),
base_not AS (
  SELECT "NEC_Event_No", "Receive_Date"
  FROM base
  WHERE COALESCE("CE_Status",'') <> ''
    AND ("CE_Status" LIKE 'Quotation to be submitted%' OR "CE_Status" LIKE 'Quotation to be assessed%')
    AND "Receive_Date" IS NOT NULL
),
per_event_not AS (
  SELECT "NEC_Event_No", MIN("Receive_Date") AS first_receive_date
  FROM base_not
  GROUP BY "NEC_Event_No"
),
labeled AS (
  SELECT
    (CURRENT_DATE - first_receive_date::date) AS days_since_receive,
    CASE
      WHEN age(CURRENT_DATE, first_receive_date::date) < interval '3 months'  THEN '< 3 months'
      WHEN age(CURRENT_DATE, first_receive_date::date) < interval '6 months'  THEN '3 - 6 months'
      WHEN age(CURRENT_DATE, first_receive_date::date) < interval '9 months'  THEN '6 - 9 months'
      WHEN age(CURRENT_DATE, first_receive_date::date) < interval '12 months' THEN '9 - 12 months'
      WHEN age(CURRENT_DATE, first_receive_date::date) < interval '24 months' THEN '12 - 24 months'
      ELSE '> 24 months'
    END AS bucket
  FROM per_event_not
),
ce_not_buckets AS (
  SELECT
    COALESCE(bucket, 'TOTAL') AS bucket,
    COUNT(*) AS total_events,
    ROUND(AVG(days_since_receive)::numeric, 2) AS avg_days_since_receive
  FROM labeled
  GROUP BY ROLLUP (bucket)
),
eligible_cnce AS (
  SELECT DISTINCT "NEC_Event_No"
  FROM public.c5_nec_report_data
  WHERE "NEC_Event_No" LIKE 'CN-%'
    AND "CE_Status" <> ''
),
per_event_cnce AS (
  SELECT
    t."NEC_Event_No",
    MIN(CASE WHEN t."NEC_Doc_Type" LIKE 'PMN%' THEN t."Receive_Date" END) AS pmn_date,
    MIN(CASE WHEN t."NEC_Doc_Type" LIKE 'NCE%' THEN t."Receive_Date" END) AS nce_date
  FROM base t
  JOIN eligible_cnce e USING ("NEC_Event_No")
  GROUP BY t."NEC_Event_No"
),
paired_cnce AS (
  SELECT
    "NEC_Event_No",
    pmn_date,
    nce_date,
    EXTRACT(EPOCH FROM (pmn_date - nce_date)) / 86400.0 AS days_diff
  FROM per_event_cnce
  WHERE pmn_date IS NOT NULL
    AND nce_date IS NOT NULL
),
cnce_pmn_diff AS (
  SELECT
    COUNT(*) AS total_events,
    ROUND(AVG(days_diff)::numeric, 2) AS avg_days_nce_minus_pmn
  FROM paired_cnce
)
SELECT 'ce_implemented'::text AS metric,
       NULL::text AS label,
       ce_impl.total_events,
       ce_impl.avg_response_days,
       ce_impl.avg_response_interval_text AS avg_response_interval
FROM ce_impl
UNION ALL
SELECT 'ce_due_inclement_weather'::text AS metric,
       NULL::text AS label,
       ce_inc.total_events,
       ce_inc.avg_response_days,
       NULL::text AS avg_response_interval
FROM ce_inc
UNION ALL
SELECT 'ce_pending_contractors_quotation'::text AS metric,
       NULL::text AS label,
       ce_pending.total_events,
       ce_pending.avg_days_since_receive AS avg_response_days,
       NULL::text AS avg_response_interval
FROM ce_pending
UNION ALL
SELECT 'ce_under_review_by_project_manager'::text AS metric,
       NULL::text AS label,
       ce_review.total_events,
       ce_review.avg_days_since_receive AS avg_response_days,
       NULL::text AS avg_response_interval
FROM ce_review
UNION ALL
SELECT 'ce_not_yet_implemented'::text AS metric,
       ce_not_buckets.bucket AS label,
       ce_not_buckets.total_events,
       ce_not_buckets.avg_days_since_receive AS avg_response_days,
       NULL::text AS avg_response_interval
FROM ce_not_buckets
UNION ALL
SELECT 'cnce_accepted_up_to_now'::text AS metric,
       NULL::text AS label,
       cnce_pmn_diff.total_events,
       cnce_pmn_diff.avg_days_nce_minus_pmn AS avg_response_days,
       NULL::text AS avg_response_interval
FROM cnce_pmn_diff;
'''


def _latest_revised_by_key(cas_df):
    """Return the latest Revised_Completion_Date per allowed Key_Date.

    Args:
        cas_df: DataFrame read from ``c5_nec_cas``.

    Returns:
        pandas.Series indexed by ``Key_Date``. It is empty when either
        required column is absent or no row qualifies.
    """
    required = ('Key_Date', 'Revised_Completion_Date')
    missing = [col for col in required if col not in cas_df.columns]
    if missing:
        # The original guarded only the datetime conversion and then failed
        # with a KeyError; treat a missing column as "nothing to map", in line
        # with how a missing key column in c5_key_date_data is handled.
        print('Warning: c5_nec_cas is missing column(s) {}; skipping latest revised mapping'.format(', '.join(missing)))
        return pd.Series(dtype='datetime64[ns]')

    frame = cas_df[list(required)].copy()
    # Unparseable dates become NaT and are dropped below.
    frame['Revised_Completion_Date'] = pd.to_datetime(frame['Revised_Completion_Date'], errors='coerce')
    frame = frame[frame['Key_Date'].isin(_ALLOWED_KEY_DATES)]
    return (
        frame.dropna(subset=list(required))
             .groupby('Key_Date')['Revised_Completion_Date']
             .max()
    )


def _refresh_key_date_report(engine):
    """Rebuild ``c5_key_date_data_health_check_report`` from ``c5_key_date_data``.

    The output is a copy of ``c5_key_date_data`` with an extra
    ``latest revised`` column holding the latest revised completion date found
    in ``c5_nec_cas`` for the row's key date.
    """
    with engine.connect() as conn:
        try:
            cas_df = pd.read_sql('SELECT * FROM c5_nec_cas', conn)
        except Exception:
            # Fallback kept from the original implementation.
            cas_df = pd.read_sql_table('c5_nec_cas', conn)
        kd_df = pd.read_sql('SELECT * FROM c5_key_date_data', conn)

    latest_by_key = _latest_revised_by_key(cas_df)

    # The key column is accepted under either spelling.
    key_col = next((name for name in ('Key_Date', 'key_Date') if name in kd_df.columns), None)
    if key_col is not None:
        kd_df['latest revised'] = kd_df[key_col].map(latest_by_key)
    else:
        print('Warning: No Key_Date column found in c5_key_date_data; skipping latest revised mapping')

    # engine.begin() commits on success on every SQLAlchemy version.
    with engine.begin() as conn:
        kd_df.to_sql('c5_key_date_data_health_check_report', con=conn, if_exists='replace', index=False)


def _query_to_frame(conn, sql):
    """Run ``sql`` on ``conn`` and return all rows as a DataFrame."""
    result = conn.execute(text(sql))
    if hasattr(result, 'mappings'):  # SQLAlchemy 1.4+
        return pd.DataFrame(result.mappings().all())
    # Older SQLAlchemy ResultProxy
    rows = result.fetchall()
    return pd.DataFrame(rows, columns=list(result.keys()))


def _materialize_query(engine, sql, table_name):
    """Store the result of ``sql`` in ``table_name``, replacing the table.

    Failures are reported with ``print`` and not re-raised, so one broken
    report does not prevent the following ones from being built.
    """
    try:
        with engine.connect() as conn:
            result_df = _query_to_frame(conn, sql)
        # Write in an explicit transaction: on SQLAlchemy 2.x the SELECT above
        # autobegins a transaction that is rolled back when the connection is
        # closed, which would discard a write made on the same connection.
        with engine.begin() as conn:
            result_df.to_sql(table_name, con=conn, if_exists='replace', index=False)
    except Exception as e:
        print('Error building {}:'.format(table_name), str(e))


def pipelineProcess(**context):
    """Rebuild the C5 health-check report tables in the data warehouse.

    Three tables are replaced in order:

    * ``c5_key_date_data_health_check_report`` - key dates plus the latest
      revised completion date (errors propagate and fail the task);
    * ``c5_health_check_report_ce_cnce_data`` - single-row CE/CNCE counts
      (best effort, errors are printed);
    * ``c5_health_check_report_ce_merged_metrics`` - per-metric CE figures
      (best effort, errors are printed).

    Args:
        **context: Airflow task context. It is not used: the token pushed by
            the upstream task was only read into an unused local, so that
            lookup has been removed.
    """
    # One engine for the whole run, released even when a step fails.
    engine = create_engine(getdrowPSQLConnectionString())
    try:
        _refresh_key_date_report(engine)
        _materialize_query(engine, _CE_CNCE_COUNTS_SQL, 'c5_health_check_report_ce_cnce_data')
        _materialize_query(engine, _CE_MERGED_METRICS_SQL, 'c5_health_check_report_ce_merged_metrics')
    finally:
        engine.dispose()

# Cron "0 0,4,8,11,16 * * *": runs on the hour at 00:00, 04:00, 08:00, 11:00
# and 16:00 every day.
with DAG(
    dag_id="c5_nec_report_copy",
    schedule_interval="0 0,4,8,11,16 * * *",
    default_args=dict(
        owner="airflow",
        retries=1,
        retry_delay=timedelta(minutes=5),
        start_date=datetime(2022, 10, 24),
    ),
    catchup=False,
) as f:

    # Each operator is bound to the same name as the function it wraps, so
    # from here on these module-level names refer to the tasks.
    getDrowToken = PythonOperator(
        task_id="getDrowToken",
        python_callable=getDrowToken,
        provide_context=True,
    )

    pipelineProcess = PythonOperator(
        task_id="pipelineProcess",
        python_callable=pipelineProcess,
        provide_context=True,
    )

    # Follow-up tasks that were sketched earlier but are not wired in:
    # getWorkflowRecords (PythonOperator) and the create_table / insert_data
    # PostgresOperator steps (postgres_conn_id "postgres_rds").

# Authenticate first, then rebuild the report tables.
getDrowToken.set_downstream(pipelineProcess)
DAG: c5_nec_keydate_update

schedule: 0 0,4,8,11,16 * * *

c5_nec_keydate_update