# Airflow - DAGs

try:

    from datetime import timedelta
    from airflow import DAG
    
    from airflow.operators.python_operator import PythonOperator
    from datetime import datetime
    from pandas.io.json import json_normalize

    import pandas as pd
    import json
    import requests
    import numpy as np
    import re

    import psycopg2
    from sqlalchemy import create_engine

except Exception as e:
    print("Error {} ".format(e))

# Base URL of the dRoW API; endpoint paths are appended to it when the
# request URLs are built in the task callables below.
dRoW_api_end_url = "https://drow.cloud"

def getDrowToken(**context):
    """Authenticate against the dRoW API and publish the token via XCom.

    Posts the service-account credentials to ``/api/auth/authenticate`` and
    pushes the ``token`` field of the JSON reply to XCom under the key
    ``"token"`` so that downstream tasks can pull it.

    Args:
        **context: Airflow task context; ``context["ti"]`` must be the
            running task instance.

    Raises:
        requests.HTTPError: If the API answers with a 4xx/5xx status.
        requests.Timeout: If the API does not answer within the timeout.
        ValueError: If the reply does not contain a ``token`` value.
    """
    # NOTE(security): the credentials are hard-coded in source control; they
    # should be moved to an Airflow Connection/Variable or a secrets backend.
    response = requests.post(
        url=f"{dRoW_api_end_url}/api/auth/authenticate",
        data={
            "username": "icwp2@drow.cloud",
            "password": "dGVzdDAxQHRlc3QuY29t",
        },
        # requests waits indefinitely by default, which would block the
        # worker slot forever if the API stalls: (connect, read) seconds.
        timeout=(10, 60),
    )
    # Surface authentication/server failures as an HTTP error instead of an
    # opaque KeyError on the missing "token" field.
    response.raise_for_status()

    token = response.json().get("token")
    if not token:
        raise ValueError("dRoW authentication response did not contain a token")

    context["ti"].xcom_push(key="token", value=token)


def _first_cell(frame, column):
    """Return the first-row value of *column*, or ``None`` if it is absent."""
    if column not in frame.columns or frame.empty:
        return None
    return frame[column].iloc[0]


def _on_time_status(row):
    """Return "On-Time", "Late", or ``None`` when either date is missing."""
    completed = row.get('Date completed')
    agreed = row.get('Agreed date for completion')
    if not completed or not agreed:
        return None
    return "On-Time" if completed <= agreed else "Late"


def _collect_checklist_rows(entries, template, monthly_summary):
    """Flatten exported checklist documents into one dict per checklist item.

    Args:
        entries: Iterable of exported documents; each must have a ``data``
            member holding the form fields and its ``"Table <n>"`` lists.
        template: Value stored in the ``Template`` field of every row.
        monthly_summary: Dict keyed by ``YYYY-MM`` (the first seven characters
            of the inspection date); updated in place with the running
            ``items`` and ``concern`` counters.

    Returns:
        list[dict]: One dict per checklist item, in document/table order.
    """
    rows = []
    for entry in entries:
        frame = json_normalize(entry['data'])

        date_of_inspection = _first_cell(frame, 'Date of Inspection')
        # Without a textual inspection date the entry cannot be assigned to a
        # month, so it is skipped (covers a missing field, None and NaN).
        if not isinstance(date_of_inspection, str):
            continue
        month = date_of_inspection[:7]
        contract_title = _first_cell(frame, 'Contract Title')

        table_count = sum(1 for column in frame.columns if 'Table' in column)

        # Tables are numbered from 1, so the last one is "Table <table_count>";
        # the upper bound must be inclusive or the final table is dropped.
        for group_no in range(1, table_count + 1):
            table_rows = _first_cell(frame, f"Table {group_no}")
            if not isinstance(table_rows, list):
                # Table absent (or null) in this document.
                continue

            prefix = f'{group_no}. '
            for record in table_rows:
                if not isinstance(record, dict) or not record:
                    continue
                # The first field of a record names the group and its value
                # reads "<item no> <description>".
                group_key = next(iter(record))
                headline = record[group_key]
                if not isinstance(headline, str):
                    # No item number/description to extract from this row.
                    continue
                item_no = headline.split(" ")[0]

                row = {
                    'Date of Inspection': date_of_inspection,
                    'Month': month,
                    'Contract Title': contract_title,
                    'Group No.': str(group_no),
                    'Group': group_key,
                    'Item No.': item_no,
                    'Description': headline.replace(f"{item_no} ", ""),
                    'Template': template,
                }
                # Remaining fields are copied with the "<n>. " group prefix
                # removed from their names.
                for key, value in record.items():
                    if key != group_key:
                        row[key.replace(prefix, "")] = value
                row['On Time'] = _on_time_status(row)
                rows.append(row)

                stats = monthly_summary.setdefault(month, {'items': 0, 'concern': 0})
                stats['items'] += 1
                if row.get('Safety Compliance') == 'No':
                    stats['concern'] += 1
    return rows


def getMongoDB(**context):
    """Export the safety-inspection checklists and load them into PostgreSQL.

    Pulls the API token from XCom (key ``"token"``), downloads the S01 (daily)
    and S02 (weekly) checklist exports from the dRoW API, flattens every
    checklist item into a row and then replaces two tables:

    * ``safety_inspection_dc202312`` - the rows whose ``Safety Compliance``
      is ``"No"``, sorted by inspection date, contract title and item number.
    * ``safety_inspection_summary_dc202312`` - per-month counts of all items
      and of non-compliant items.

    Args:
        **context: Airflow task context; ``context["ti"]`` must be the
            running task instance.

    Raises:
        requests.HTTPError: If an export request answers with a 4xx/5xx status.
        requests.Timeout: If an export request does not finish in time.
        ValueError: If the exports yield no row with a ``Safety Compliance``
            field (the existing tables are left untouched).
        KeyError: If a document has no ``data`` member or an expected output
            column is missing from the flattened rows.
    """
    token = context.get("ti").xcom_pull(key="token")

    # NOTE(security): the access key and the database credentials below are
    # hard-coded in source control; they should be moved to an Airflow
    # Connection/Variable or a secrets backend.
    headers = {
        "x-access-token": f"Bearer {token}",
        "ICWPxAccessKey": "nd@201907ICWP_[1AG:4UdI){n=b~",
    }
    # (workflow id in the export URL, template label written to the table)
    templates = (
        ("6597889461a8f490bf96667f", 'S01_Daily Site Safety Inspection Checklist'),
        ("65ae1f219aad62a7971a07bb", 'S02_Weekly Site Safety Inspection Checklist'),
    )

    monthly_summary = {}
    rows = []
    for workflow_id, template in templates:
        response = requests.get(
            url=f"{dRoW_api_end_url}/api/module/document-export/airflow/workflow/{workflow_id}?export_type=0",
            headers=headers,
            # requests has no default timeout; bound the wait so a stalled
            # export cannot block the worker forever: (connect, read) seconds.
            timeout=(10, 600),
        )
        # Fail on HTTP errors rather than trying to parse an error page.
        response.raise_for_status()
        rows.extend(
            _collect_checklist_rows(json.loads(response.text), template, monthly_summary)
        )

    # Build the frame once instead of concatenating per document.
    full_df = pd.DataFrame(data=rows)
    if full_df.empty or 'Safety Compliance' not in full_df.columns:
        # Keep failing the task (as before) but say why, and do it before the
        # target tables are replaced.
        raise ValueError("No safety inspection records with a 'Safety Compliance' field were exported")

    # sort_values returns a new frame, so the rename below never operates on
    # a view of full_df.
    non_compliant_df = full_df[full_df['Safety Compliance'] == 'No'].sort_values(
        by=['Date of Inspection', 'Contract Title', 'Item No.']
    )
    # Clean up column names for SQL
    non_compliant_df.columns = non_compliant_df.columns.str.replace(' ', '_').str.replace(r'[().%]', '', regex=True).str.replace('/', '_')

    # Retrieve only relevant columns
    final_df = non_compliant_df[['Date_of_Inspection', 'Month', 'Contract_Title', 'Template', 'Group_No', 'Group', 'Item_No', 'Description', 'Location', 'Safety_Compliance', 'Date_completed', 'Agreed_date_for_completion', 'On_Time']]

    summary_df = pd.DataFrame(data=[
        {'Month': month, 'Items': stats['items'], 'Concerns': stats['concern']}
        for month, stats in monthly_summary.items()
    ])

    host = 'drowdatewarehouse.crlwwhgepgi7.ap-east-1.rds.amazonaws.com'
    # User name of the database server
    dbUserName = 'dRowAdmin'
    # Password for the database user
    dbUserPassword = 'drowsuper'
    # Name of the database
    database = 'drowDateWareHouse'
    port = "5432"

    # SQLAlchemy 1.4+ no longer accepts the legacy "postgres://" scheme;
    # "postgresql://" is valid on every SQLAlchemy version.
    conn_string = ('postgresql://' +
                   dbUserName + ':' +
                   dbUserPassword +
                   '@' + host + ':' + port +
                   '/' + database)

    db = create_engine(conn_string)
    try:
        # The connection is opened only once the data is ready to be written.
        with db.connect() as conn:
            # Write to SQL database
            final_df.to_sql('safety_inspection_dc202312', con=conn, if_exists='replace', index=False)
            summary_df.to_sql('safety_inspection_summary_dc202312', con=conn, if_exists='replace', index=False)
    finally:
        # Release the pooled connections held by the engine.
        db.dispose()


# Schedule: "0 15 * * *" runs once a day at 15:00
# (NOTE(review): in the scheduler's timezone - presumably UTC; confirm).
# Use "*/2 * * * *" instead to execute every two minutes, e.g. while testing.
with DAG(
        dag_id="dc202312_safety_inspection",
        schedule_interval="0 15 * * *",
        default_args={
            "owner": "airflow",
            "retries": 1,
            "retry_delay": timedelta(minutes=5),
            "start_date": datetime(2023, 1, 17)
        },
        # Do not backfill the runs between start_date and today.
        catchup=False) as f:
    
    # NOTE: this assignment rebinds the module-level name `getMongoDB` from
    # the function to the operator. `python_callable=getMongoDB` is evaluated
    # before the rebinding, so the operator still receives the function.
    getMongoDB = PythonOperator(
        task_id="getMongoDB",
        python_callable=getMongoDB,
        # Forwarded to the callable as a keyword argument; the callable only
        # accepts **context and never reads "name".
        op_kwargs={"name": "Dylan"},
        provide_context=True,
    )

    # reformData = PythonOperator(
    #     task_id="reformData",
    #     python_callable=reformData,
    #     provide_context=True,
    #     # op_kwargs={"name": "Dylan"}
    # )

    # Same rebinding as above: `getDrowToken` now names the operator that
    # wraps the function of the same name.
    getDrowToken = PythonOperator(
        task_id="getDrowToken",
        python_callable=getDrowToken,
        provide_context=True,
        # op_kwargs={"name": "Dylan"}
    )

    # insertData = PythonOperator(
    #     task_id="insetDateToPG",
    #     python_callable=insertData,
    #     provide_context=True,
    #     # op_kwargs={"name": "Dylan"}
    # )

    # create_table = PostgresOperator(
    #     sql = create_table_sql_query,
    #     task_id = "create_table_task",
    #     postgres_conn_id = "postgres_rds",
    # )

    # insert_data = PostgresOperator(
    #     sql = insert_data_sql_query,
    #     task_id = "insertData_sql_query_task",
    #     postgres_conn_id = "postgres_rds",
    # )

# Task order: fetch the API token first, then export and load the data
# (the second task pulls the token from XCom).
getDrowToken >> getMongoDB
# DAG: dc202312_safety_inspection

# schedule: 0 15 * * *

# dc202312_safety_inspection