Airflow - DAGs

try:

    from datetime import timedelta
    from airflow import DAG
    
    from airflow.operators.python_operator import PythonOperator
    from airflow.operators.http_operator import SimpleHttpOperator
    from datetime import datetime
    from pandas.io.json import json_normalize
    from airflow.operators.postgres_operator import PostgresOperator

    import pandas as pd
    import json
    import requests
    import numpy as np

    import psycopg2
    from sqlalchemy import create_engine

except Exception as e:
    print("Error {} ".format(e))

# Base URL of the dRoW API; every request URL in this file is built from it.
dRoW_api_end_url = "https://drow.cloud"

def getDrowToken(**context):
    """Authenticate against the dRoW API and publish the token through XCom.

    Args:
        **context: Airflow task context; ``context["ti"]`` is used to push
            the token under the XCom key ``"token"``.

    Raises:
        requests.HTTPError: If the authentication endpoint answers with an
            error status.
        ValueError: If the response body carries no usable token.
    """
    # NOTE(review): the credentials are hard-coded in the DAG file; consider
    # moving them to an Airflow Connection/Variable.
    response = requests.post(
        url=f"{dRoW_api_end_url}/api/auth/authenticate",
        data={
            "username": "icwp2@drow.cloud",
            "password": "dGVzdDAxQHRlc3QuY29t"
        },
        # Without a timeout a stalled endpoint would block the task forever.
        timeout=60,
    )
    # Fail here with the HTTP status instead of with a KeyError further down.
    response.raise_for_status()

    token = response.json().get('token')
    if not token:
        # Pushing an empty token would only make the downstream tasks fail
        # later with a less obvious authorisation error.
        raise ValueError("dRoW authentication response did not contain a token")
    context["ti"].xcom_push(key="token", value=token)


def getdrowPSQLConnectionString():
    """Build the SQLAlchemy URL of the dRoW data-warehouse PostgreSQL database.

    Returns:
        str: A URL of the form
        ``postgresql://<user>:<password>@<host>:<port>/<database>``.
    """
    # NOTE(review): host and credentials are hard-coded; consider reading them
    # from an Airflow Connection instead of the DAG source.
    host = 'drowdatewarehouse.crlwwhgepgi7.ap-east-1.rds.amazonaws.com'
    # User name of the database server
    dbUserName = 'dRowAdmin'
    # Password for the database user
    dbUserPassword = 'drowsuper'
    # Name of the database
    database = 'drowDateWareHouse'
    port = "5432"
    # "postgresql" is the dialect name SQLAlchemy documents. The legacy
    # "postgres" alias is no longer accepted by SQLAlchemy 1.4+, where
    # create_engine() would fail on it; "postgresql" works on every version.
    return f"postgresql://{dbUserName}:{dbUserPassword}@{host}:{port}/{database}"

def getFirstAction(**context):
    """Load site-diary labour and equipment rows from dRoW into PostgreSQL.

    Downloads the workflow export, expands every diary record into one row
    per labour trade and one row per equipment type, and replaces the tables
    ``site_diary_activities_labour_hy202308`` and
    ``site_diary_activities_equipment_hy202308`` with the result.

    Args:
        **context: Airflow task context; the API token is pulled from XCom
            (key ``"token"``) through ``context["ti"]``.

    Raises:
        requests.HTTPError: If the export endpoint answers with an error
            status.
        ValueError: If the export contains no records, so that the existing
            tables are not replaced by empty ones.
    """
    token = context.get("ti").xcom_pull(key="token")
    response = requests.get(
        url=f"{dRoW_api_end_url}/api/module/document-export/airflow/workflow/66f36ce8f5fc3886d9e523dd?export_type=0",
        headers={
            "x-access-token": f"Bearer {token}",
            "ICWPxAccessKey": "nd@201907ICWP_[1AG:4UdI){n=b~"
        },
        # Without a timeout a stalled export would block the task forever.
        timeout=600,
    )
    # Surface HTTP errors here rather than as an obscure failure while
    # iterating over an error payload below.
    response.raise_for_status()

    RISC_Data = json.loads(response.text)
    Mapping = {
        "A01 Date": "a01_date",
        "A02 Portion": "a02_portion_no",
        "A02 Location": "a03_location",
        "A03 Activity": "a04_activity",
    }
    source_columns = list(Mapping)

    def clean_columns(frame):
        """Rewrite the column labels of *frame* into SQL-friendly names."""
        labels = frame.columns
        # regex=False: "." and "(" must be replaced literally on every pandas
        # version instead of depending on the version's regex default.
        for old, new in ((' ', '_'), ('.', '_'), ('(', '_'), (')', ''),
                         ('%', 'percent'), ('/', '_'), ('__', '_')):
            labels = labels.str.replace(old, new, regex=False)
        frame.columns = labels
        return frame

    labour_frames = []
    equipment_frames = []
    for x in RISC_Data:
        record = x['data']
        # reindex (rather than strict column selection) keeps a record usable
        # when one of the mapped fields is absent; the column is then NaN.
        base = json_normalize(record).reindex(columns=source_columns)

        # --- labour: one row per trade, or a single zero-headcount row ---
        labour_items = record.get('A05 Labour') or []
        if labour_items:
            per_trade = []
            for c in labour_items:
                headcount = c.get('A05.3 No.')
                per_trade.append(base.assign(
                    labour_type=str(c.get('A05.2 Trade', '')),
                    labour_number=0 if headcount is None else headcount,
                ))
            labour_df = pd.concat(per_trade, ignore_index=True)
        else:
            labour_df = base.assign(labour_type=None, labour_number=0)
        labour_frames.append(clean_columns(labour_df.rename(columns=Mapping)))

        # --- equipment: the bare diary row plus one row per equipment type ---
        raw_date = record.get('A01 Date')
        date_obj = pd.to_datetime(raw_date) if raw_date else None
        equipment_items = record.get('A06 Equipment') or []
        # temp fix to solve storage issue: equipment rows are only expanded
        # for diaries dated after 2024.
        if equipment_items and date_obj is not None and date_obj.year > 2024:
            per_equipment = [base]
            for c in equipment_items:
                working = c.get('A06.3 Working No.')
                idle = c.get('A06.4 Idle No')
                per_equipment.append(base.assign(
                    equipment_name=str(c.get('A06.1 Type', '')),
                    equipment_working_number=0 if working is None else working,
                    equipment_Idle_number=0 if idle is None else idle,
                ))
            equipment_df = pd.concat(per_equipment)
        else:
            equipment_df = base
        equipment_frames.append(clean_columns(equipment_df.rename(columns=Mapping)))

    if not labour_frames:
        raise ValueError(
            "dRoW export returned no site diary records; "
            "the labour/equipment tables were left untouched"
        )

    # Concatenate once instead of growing a frame inside the loop, which
    # copies all accumulated rows on every iteration.
    df = pd.concat(labour_frames)
    _df = pd.concat(equipment_frames)
    # Parsed value by value on purpose: the timestamps are not guaranteed to
    # share one exact string format.
    df['a01_date'] = df['a01_date'].apply(pd.to_datetime)
    _df['a01_date'] = _df['a01_date'].apply(pd.to_datetime)

    # The connection is opened only for the writes and closed by the context
    # manager, so it is not held while the export is being processed.
    db = create_engine(getdrowPSQLConnectionString())
    with db.connect() as conn:
        df.to_sql('site_diary_activities_labour_hy202308', con=conn, if_exists='replace', index=False)
        _df.to_sql('site_diary_activities_equipment_hy202308', con=conn, if_exists='replace', index=False)

def getSecondAction(**context):
    """Count site-diary records per date and store the counts in PostgreSQL.

    Downloads the workflow export, keeps the date and location of every
    record, counts the records sharing the same ``a01_date`` and replaces the
    table ``site_diary_activities_general_count_hy202308`` with one row per
    date (first location seen plus a ``Count`` column).

    Args:
        **context: Airflow task context; the API token is pulled from XCom
            (key ``"token"``) through ``context["ti"]``.

    Raises:
        requests.HTTPError: If the export endpoint answers with an error
            status.
        ValueError: If the export contains no records, so that the existing
            table is not replaced by an empty one.
    """
    token = context.get("ti").xcom_pull(key="token")
    response = requests.get(
        url=f"{dRoW_api_end_url}/api/module/document-export/airflow/workflow/66f36ce8f5fc3886d9e523dc?export_type=0",
        headers={
            "x-access-token": f"Bearer {token}",
            "ICWPxAccessKey": "nd@201907ICWP_[1AG:4UdI){n=b~"
        },
        # Without a timeout a stalled export would block the task forever.
        timeout=600,
    )
    # Surface HTTP errors here rather than as an obscure failure while
    # iterating over an error payload below.
    response.raise_for_status()

    RISC_Data = json.loads(response.text)
    Mapping = {
        "A01 Date": "a01_date",
        "A02 Location": "a03_location",
    }
    source_columns = list(Mapping)

    def clean_columns(frame):
        """Rewrite the column labels of *frame* into SQL-friendly names."""
        labels = frame.columns
        # regex=False: "." and "(" must be replaced literally on every pandas
        # version instead of depending on the version's regex default.
        for old, new in ((' ', '_'), ('.', '_'), ('(', '_'), (')', ''),
                         ('%', 'percent'), ('/', '_'), ('__', '_')):
            labels = labels.str.replace(old, new, regex=False)
        frame.columns = labels
        return frame

    frames = []
    for x in RISC_Data:
        row = json_normalize(x['data']).reindex(columns=source_columns)
        frames.append(clean_columns(row.rename(columns=Mapping)))

    if not frames:
        raise ValueError(
            "dRoW export returned no site diary records; "
            "the general count table was left untouched"
        )

    # Concatenate once instead of growing a frame inside the loop.
    df = pd.concat(frames)
    # Number of diary records per date, attached to the first row of each date.
    cnt = df.groupby('a01_date').size().rename('Count')
    df = df.drop_duplicates(subset='a01_date').merge(cnt, left_on='a01_date', right_index=True)
    # Parsed value by value on purpose: the timestamps are not guaranteed to
    # share one exact string format.
    df['a01_date'] = df['a01_date'].apply(pd.to_datetime)

    # The connection is opened only for the write and closed by the context
    # manager.
    db = create_engine(getdrowPSQLConnectionString())
    with db.connect() as conn:
        df.to_sql('site_diary_activities_general_count_hy202308', con=conn, if_exists='replace', index=False)

# site_diary_general_contractor_management
def getThirdAction(**context):
    """Load general site-diary status and contractor-management rows.

    For every exported diary record this derives the number of site
    activities on its date (read back from
    ``site_diary_activities_general_count_hy202308``), a complete/incomplete
    flag and the latest sign-off time plus overdue flag of the three
    approval steps, and replaces ``site_diary_general_hy202308`` with the
    result. Records that list a contractor management team additionally
    yield one row per post in
    ``site_diary_general_contractor_managementt_hy202308``.

    Args:
        **context: Airflow task context; the API token is pulled from XCom
            (key ``"token"``) through ``context["ti"]``.

    Raises:
        requests.HTTPError: If the export endpoint answers with an error
            status.
        ValueError: If the export contains no records, so that the existing
            tables are not replaced by empty ones.
    """
    token = context.get("ti").xcom_pull(key="token")
    response = requests.get(
        url=f"{dRoW_api_end_url}/api/module/document-export/airflow/workflow/66f36ce8f5fc3886d9e523dc?export_type=0",
        headers={
            "x-access-token": f"Bearer {token}",
            "ICWPxAccessKey": "nd@201907ICWP_[1AG:4UdI){n=b~"
        },
        # Without a timeout a stalled export would block the task forever.
        timeout=600,
    )
    # Surface HTTP errors here rather than as an obscure failure while
    # iterating over an error payload below.
    response.raise_for_status()

    RISC_Data = json.loads(response.text)
    Mapping = {
        "A01 Date": "a01_date"
    }
    source_columns = list(Mapping)
    time_format = '%Y-%m-%dT%H:%M:%S.%f%z'
    # Date assumed for records that carry no "A01 Date".
    fallback_date = datetime.strptime('2022-09-22T00:00:00.000Z', time_format)
    # Records dated before this instant are always reported as complete.
    complete_before = datetime.strptime('2022-10-01T00:00:00.000Z', time_format)
    # (sign-time column, overdue column, approval status name)
    signoff_steps = (
        ('pmd_sign_time', 'Overdue_PMD', "D : CRE Confirm"),
        ('cr_sign_time', 'Overdue_CR', "C : Site Agent Confirm"),
        ('sup_sign_time', 'Overdue_SUP', "B : SIOW Sign"),
    )

    def clean_columns(frame):
        """Rewrite the column labels of *frame* into SQL-friendly names."""
        labels = frame.columns
        # regex=False: "." and "(" must be replaced literally on every pandas
        # version instead of depending on the version's regex default.
        for old, new in ((' ', '_'), ('.', '_'), ('(', '_'), (')', ''),
                         ('%', 'percent'), ('/', '_'), ('__', '_')):
            labels = labels.str.replace(old, new, regex=False)
        frame.columns = labels
        return frame

    db = create_engine(getdrowPSQLConnectionString())
    with db.connect() as conn:
        df_sd = pd.read_sql('SELECT * FROM site_diary_activities_general_count_hy202308;', conn, parse_dates=['a01_date'])

    general_frames = []
    team_frames = []
    for x in RISC_Data:
        record = x['data']
        df_nested_list = json_normalize(record)
        df2 = df_nested_list.reindex(columns=source_columns)

        raw_date = record.get('A01 Date')
        # The local name "date" is referenced by the query string below.
        date = datetime.strptime(raw_date, time_format) if raw_date else fallback_date

        no_of_site_activities = df_sd.query('a01_date == @date')
        if len(no_of_site_activities['Count']) != 0:
            df2['no_of_site_activities'] = no_of_site_activities['Count'].iloc[0]
        else:
            df2['no_of_site_activities'] = 0

        if date < complete_before or x.get('Status') == 'End Case':
            df2['complete_or_incomplete'] = 'complete'
        else:
            df2['complete_or_incomplete'] = 'incomplete'

        # The sign-offs are derived afresh for every record. Previously they
        # were only assigned when the approval log was non-empty, so a record
        # with an empty log either raised NameError or silently reused the
        # sign-offs of the preceding record.
        approve_log = x.get('ApproveLogSummary') or []
        for time_column, overdue_column, status_name in signoff_steps:
            sign_time, overdue = _latest_signoff(approve_log, status_name)
            df2[time_column] = sign_time
            df2[overdue_column] = overdue

        team = record.get("A04 Contractor's Management Team") or []
        if team:
            base = df_nested_list.reindex(columns=source_columns)
            per_post = []
            for c in team:
                head_count = c.get("A04.2 Ctr No.")
                per_post.append(base.assign(
                    contractor_management_post_name=str(c.get("A04.1 Ctr Post", '')),
                    contractor_management_number=0 if head_count is None else head_count,
                ))
            team_df = clean_columns(pd.concat(per_post).rename(columns=Mapping))
            team_frames.append(team_df.fillna(np.nan))

        df2.rename(columns=Mapping, inplace=True)
        clean_columns(df2)
        general_frames.append(df2.fillna(np.nan))

    if not general_frames:
        raise ValueError(
            "dRoW export returned no site diary records; "
            "the general site diary tables were left untouched"
        )

    # Concatenate once instead of growing the frames inside the loop.
    df = pd.concat(general_frames)
    # Parsed value by value on purpose: the timestamps are not guaranteed to
    # share one exact string format.
    df['a01_date'] = df['a01_date'].apply(pd.to_datetime)

    _df = None
    if team_frames:
        _df = pd.concat(team_frames)
        _df['a01_date'] = _df['a01_date'].apply(pd.to_datetime)

    with db.connect() as conn:
        df.to_sql('site_diary_general_hy202308', con=conn, if_exists='replace', index=False)
        if _df is not None:
            # NOTE(review): "managementt" (double t) is the existing table
            # name; it is kept because consumers may depend on it.
            _df.to_sql('site_diary_general_contractor_managementt_hy202308', con=conn, if_exists='replace', index=False)
        else:
            print("No contractor management rows in the export; "
                  "site_diary_general_contractor_managementt_hy202308 left untouched")


def _latest_signoff(approve_log, status_name):
    """Return ``(sign_time, overdue)`` of the latest *status_name* log entry.

    Args:
        approve_log: List of approval-log dicts; the keys ``statusName``,
            ``from`` and ``to`` are read.
        status_name: Value of ``statusName`` that selects the approval step.

    Returns:
        tuple: ``(None, None)`` when the step has no entry or its latest
        entry has no ``to`` time yet. Otherwise the ``to`` time shifted
        forward by eight hours, and whether more than seven whole days lie
        between ``from`` and ``to`` (``False`` when ``from`` is missing).
    """
    time_format = '%Y-%m-%dT%H:%M:%S.%f%z'
    matching = [entry for entry in approve_log if entry.get('statusName') == status_name]
    if not matching:
        return None, None

    latest = matching[-1]
    signed_raw = latest.get('to')
    if not signed_raw:
        # An empty "to" used to pass the key check and then crash in strptime.
        return None, None

    signed_at = datetime.strptime(signed_raw, time_format)
    received_raw = latest.get('from')
    overdue = bool(received_raw) and (
        signed_at - datetime.strptime(received_raw, time_format)
    ).days > 7
    # NOTE(review): the +8h shift presumably expresses the time in a UTC+8
    # local time while the value stays tagged as UTC — confirm.
    return signed_at + pd.Timedelta(8, unit='h'), overdue

# def getForthAction(**context):
#     token = context.get("ti").xcom_pull(key="token")
#     response = requests.get(
#         url=f"{dRoW_api_end_url}/api/module/document-export/airflow/workflow/65efc918df43024bba49af29?export_type=0",
#         headers={
#             "x-access-token": f"Bearer {token}",
#             "ICWPxAccessKey": "nd@201907ICWP_[1AG:4UdI){n=b~"
#         }
#     )
    
#     RISC_Data = json.loads(response.text)
#     Mapping= {
#         "A01 Date" : "a01_date", 
#         "A02 Location" : "a03_location",
#     }
#     conn_string = getdrowPSQLConnectionString()    
#     db = create_engine(conn_string)
#     conn = db.connect()

#     with conn as conn:
#         df = pd.DataFrame()
#         for x in RISC_Data:
#             df_nested_list = json_normalize(x['data'])
#             df2 = df_nested_list.reindex(columns=Mapping.keys())
#             df2.rename(columns=Mapping, inplace=True)
#             df2.columns = df2.columns.str.replace(' ', '_').str.replace('.', '_').str.replace('(', '_').str.replace(')', '').str.replace('%', 'percent').str.replace('/', '_').str.replace('__', '_')
#             df = df.append(df2)
#         cnt = df.groupby('a01_date').size().rename('Count')
#         df = df.drop_duplicates(subset='a01_date').merge(cnt, left_on='a01_date', right_index=True)
#         df['a01_date']=df['a01_date'].apply(pd.to_datetime)
#         df.to_sql('site_diary_activities_general_count_dc202312', con=conn, if_exists='append', index= False)

# DAG "hy202308_site_diary_activity": the cron expression "0 15 * * *" triggers
# one run per day at 15:00 (NOTE(review): in the scheduler's timezone — confirm).
# For comparison, "*/2 * * * *" would execute every two minutes.
# catchup=False: no runs are created for the schedule intervals between
# start_date and now (NOTE(review): per Airflow's catchup semantics — confirm).
with DAG(
        dag_id="hy202308_site_diary_activity",
        schedule_interval="0 15 * * *",
        default_args={
            "owner": "airflow",
            "retries": 1,
            "retry_delay": timedelta(minutes=5),
            "start_date": datetime(2023, 1, 17)
        },
        catchup=False) as f:
    
    # NOTE: each assignment below rebinds the module-level function name to the
    # operator that wraps it; the right-hand side is evaluated first, so
    # python_callable still receives the original function.
    # The callables take **context and never read "name", so
    # op_kwargs={"name": "Dylan"} has no visible effect in this file.
    # NOTE(review): provide_context=True is what hands the task context ("ti")
    # to the callable on Airflow 1.10; presumably redundant on Airflow 2.x —
    # confirm the target version before removing it.
    getFirstAction = PythonOperator(
        task_id="getFirstAction",
        python_callable=getFirstAction,
        op_kwargs={"name": "Dylan"},
        provide_context=True,
    )
    
    getSecondAction = PythonOperator(
        task_id="getSecondAction",
        python_callable=getSecondAction,
        op_kwargs={"name": "Dylan"},
        provide_context=True,
    )
    
    getThirdAction = PythonOperator(
        task_id="getThirdAction",
        python_callable=getThirdAction,
        op_kwargs={"name": "Dylan"},
        provide_context=True,
    )

    # getForthAction = PythonOperator(
    #     task_id="getForthAction",
    #     python_callable=getForthAction,
    #     op_kwargs={"name": "Dylan"},
    #     provide_context=True,
    # )

    # Pushes the API token to XCom; every other task pulls it from there.
    getDrowToken = PythonOperator(
        task_id="getDrowToken",
        python_callable=getDrowToken,
        provide_context=True,
        # op_kwargs={"name": "Dylan"}
    )

# Strictly sequential: the token must exist before any export runs, and
# getThirdAction reads the count table that getSecondAction writes.
getDrowToken >> getFirstAction >> getSecondAction >> getThirdAction
# getDrowToken >> getFirstAction >> getSecondAction >> getThirdAction >> getForthAction
DAG: hy202308_site_diary_activity

schedule: 0 15 * * *

hy202308_site_diary_activity