Airflow - DAGs

try:

    from datetime import timedelta
    from airflow import DAG
    
    from airflow.operators.python_operator import PythonOperator
    from airflow.operators.http_operator import SimpleHttpOperator
    from datetime import datetime
    from pandas.io.json import json_normalize
    from airflow.operators.postgres_operator import PostgresOperator

    import pandas as pd
    import json
    import requests
    import numpy as np

    import psycopg2
    from sqlalchemy import create_engine

except Exception as e:
    print("Error {} ".format(e))

# Base URL of the dRoW service; every API path requested in this file is
# appended to it.
dRoW_api_end_url = "https://drow.cloud"

def getDrowToken(**context):
    """Authenticate against the dRoW API and publish the token through XCom.

    Posts the credentials to ``/api/auth/authenticate`` and pushes the
    ``token`` field of the JSON reply to XCom under the key ``"token"``,
    where the downstream export tasks pull it from.

    Args:
        **context: Airflow task context; only ``context["ti"]`` is used.

    Raises:
        requests.HTTPError: the authentication endpoint answered 4xx/5xx.
        requests.Timeout: the endpoint did not answer within the timeout.
        KeyError: the JSON reply has no ``token`` field.
    """
    # NOTE(review): the credentials below are committed in plain text; they
    # should be moved to an Airflow Connection/Variable and rotated.
    response = requests.post(
        url=f"{dRoW_api_end_url}/api/auth/authenticate",
        data={
            "username": "icwp2@drow.cloud",
            "password": "dGVzdDAxQHRlc3QuY29t"
        },
        # (connect, read) seconds: without a timeout a stalled server would
        # block the Airflow worker slot indefinitely.
        timeout=(10, 60),
    )
    # Fail on an HTTP error here instead of on a confusing KeyError or JSON
    # decoding error further down.
    response.raise_for_status()
    context["ti"].xcom_push(key="token", value=response.json()['token'])


def getdrowPSQLConnectionString():
    """Return the SQLAlchemy URL of the data-warehouse PostgreSQL database.

    The URL uses the ``postgresql://`` scheme: SQLAlchemy 1.4 removed the
    legacy ``postgres`` dialect alias, so ``postgres://`` URLs make
    ``create_engine`` fail there, while ``postgresql://`` is accepted by
    every SQLAlchemy version.

    Returns:
        str: ``postgresql://<user>:<password>@<host>:<port>/<database>``.
    """
    # NOTE(review): the database credentials are committed in plain text;
    # they should be moved to an Airflow Connection and rotated.
    host = 'drowdatewarehouse.crlwwhgepgi7.ap-east-1.rds.amazonaws.com'
    # User name of the database server
    dbUserName = 'dRowAdmin'
    # Password for the database user
    dbUserPassword = 'drowsuper'
    # Name of the database
    database = 'drowDateWareHouse'
    port = "5432"
    return f"postgresql://{dbUserName}:{dbUserPassword}@{host}:{port}/{database}"

def getFirstAction(**context):
    """Rebuild the labour and equipment tables from workflow 657fb9e3fde5354c3627f55a.

    Downloads the workflow export, and for every record produces

    * one "header" row (date, portion, location, activity) plus one row per
      entry of ``A05 Labour`` carrying ``labour_type`` / ``labour_number``;
    * one header row plus one row per entry of ``A06 Equipment`` carrying
      ``equipment_name`` / ``equipment_working_number`` /
      ``equipment_Idle_number`` (equipment rows are only produced for
      records dated after 2024).

    Both result sets are written with ``if_exists='replace'`` to
    ``site_diary_activities_labour_dc202312`` and
    ``site_diary_activities_equipment_dc202312``.

    Args:
        **context: Airflow task context; ``context["ti"]`` is used to pull
            the API token from XCom (key ``"token"``).

    Raises:
        requests.HTTPError: the export endpoint answered 4xx/5xx.
        ValueError: the export contains no records. The task must fail in
            that case because the tables are replaced here and appended to
            by a later task.
    """
    token = context.get("ti").xcom_pull(key="token")
    response = requests.get(
        url=f"{dRoW_api_end_url}/api/module/document-export/airflow/workflow/657fb9e3fde5354c3627f55a?export_type=0",
        headers={
            "x-access-token": f"Bearer {token}",
            "ICWPxAccessKey": "nd@201907ICWP_[1AG:4UdI){n=b~"
        },
        # (connect, read) seconds; keeps a stalled server from blocking the
        # worker forever.
        timeout=(10, 600),
    )
    response.raise_for_status()
    records = response.json()
    if not records:
        raise ValueError(
            "workflow 657fb9e3fde5354c3627f55a export returned no records"
        )

    column_map = {
        "A01 Date": "a01_date",
        "A02 Portion": "a02_portion_no",
        "A02 Location": "a03_location",
        "A03 Activity": "a04_activity",
    }
    # Literal replacements applied, in this order, to every column name.
    replacements = (
        (' ', '_'), ('.', '_'), ('(', '_'), (')', ''),
        ('%', 'percent'), ('/', '_'), ('__', '_'),
    )

    def finalise(parts):
        """Stack the per-record frames, tidy column names, parse the date."""
        frame = pd.concat(parts, ignore_index=True).rename(columns=column_map)
        tidy_names = []
        for name in frame.columns:
            for old, new in replacements:
                name = name.replace(old, new)
            tidy_names.append(name)
        frame.columns = tidy_names
        # Element-wise conversion, so every value is parsed independently.
        frame['a01_date'] = frame['a01_date'].apply(pd.to_datetime)
        return frame

    # Frames are collected in lists and concatenated once: DataFrame.append
    # no longer exists in pandas 2.x and copies the whole frame on every call.
    labour_parts = []
    equipment_parts = []
    for record in records:
        data = record['data']
        # One-row frame with exactly the mapped fields; reindex (rather than
        # strict column selection) turns a field absent from this record
        # into NaN instead of raising KeyError.
        header = pd.json_normalize(data).reindex(columns=list(column_map))

        labour_rows = [header]
        # A missing or null labour list is treated like an empty one.
        for entry in data.get('A05 Labour') or []:
            row = header.copy()
            row['labour_type'] = str(entry['A05.2 Trade'])
            headcount = entry.get('A05.3 No.')
            row['labour_number'] = 0 if headcount is None else headcount
            labour_rows.append(row)
        labour_parts.append(pd.concat(labour_rows))

        date_text = data.get('A01 Date')
        diary_date = (
            datetime.fromisoformat(date_text.replace("Z", "+00:00"))
            if date_text else None
        )
        equipment_rows = [header]
        # temp fix to solve storage issue: only records dated after 2024
        # contribute equipment rows.
        if diary_date is not None and diary_date.year > 2024:
            for item in data.get('A06 Equipment') or []:
                row = header.copy()
                working = item.get('A06.3 Working No.')
                idle = item.get('A06.4 Idle No')
                row['equipment_name'] = str(item['A06.1 Type'])
                row['equipment_working_number'] = 0 if working is None else working
                row['equipment_Idle_number'] = 0 if idle is None else idle
                equipment_rows.append(row)
        equipment_parts.append(pd.concat(equipment_rows))

    labour = finalise(labour_parts)
    equipment = finalise(equipment_parts)

    engine = create_engine(getdrowPSQLConnectionString())
    try:
        # The context manager closes the connection; no explicit close needed.
        with engine.connect() as conn:
            labour.to_sql('site_diary_activities_labour_dc202312', con=conn, if_exists='replace', index=False)
            equipment.to_sql('site_diary_activities_equipment_dc202312', con=conn, if_exists='replace', index=False)
    finally:
        engine.dispose()

def getSecondAction(**context):
    """Rebuild the per-date record-count table from workflow 657fb9e3fde5354c3627f55a.

    Downloads the workflow export, keeps the date and location of every
    record, counts the records per date and writes one row per distinct
    date (first location seen for that date plus a ``Count`` column) to
    ``site_diary_activities_general_count_dc202312`` with
    ``if_exists='replace'``.

    Args:
        **context: Airflow task context; ``context["ti"]`` is used to pull
            the API token from XCom (key ``"token"``).

    Raises:
        requests.HTTPError: the export endpoint answered 4xx/5xx.
        ValueError: the export contains no records. The task must fail in
            that case because the table is replaced here and appended to by
            a later task.
    """
    token = context.get("ti").xcom_pull(key="token")
    response = requests.get(
        url=f"{dRoW_api_end_url}/api/module/document-export/airflow/workflow/657fb9e3fde5354c3627f55a?export_type=0",
        headers={
            "x-access-token": f"Bearer {token}",
            "ICWPxAccessKey": "nd@201907ICWP_[1AG:4UdI){n=b~"
        },
        # (connect, read) seconds; keeps a stalled server from blocking the
        # worker forever.
        timeout=(10, 600),
    )
    response.raise_for_status()
    records = response.json()
    if not records:
        raise ValueError(
            "workflow 657fb9e3fde5354c3627f55a export returned no records"
        )

    column_map = {
        "A01 Date": "a01_date",
        "A02 Location": "a03_location",
    }
    # Literal replacements applied, in this order, to every column name.
    replacements = (
        (' ', '_'), ('.', '_'), ('(', '_'), (')', ''),
        ('%', 'percent'), ('/', '_'), ('__', '_'),
    )

    # Build all one-row frames first and concatenate once: DataFrame.append
    # no longer exists in pandas 2.x and copies the whole frame on every call.
    frames = [
        pd.json_normalize(record['data']).reindex(columns=list(column_map))
        for record in records
    ]
    diary = pd.concat(frames, ignore_index=True).rename(columns=column_map)
    tidy_names = []
    for name in diary.columns:
        for old, new in replacements:
            name = name.replace(old, new)
        tidy_names.append(name)
    diary.columns = tidy_names

    # Number of records per date, joined back onto one row per date.
    per_date = diary.groupby('a01_date').size().rename('Count')
    daily = diary.drop_duplicates(subset='a01_date').merge(
        per_date, left_on='a01_date', right_index=True
    )
    daily['a01_date'] = daily['a01_date'].apply(pd.to_datetime)

    engine = create_engine(getdrowPSQLConnectionString())
    try:
        with engine.connect() as conn:
            daily.to_sql('site_diary_activities_general_count_dc202312', con=conn, if_exists='replace', index=False)
    finally:
        engine.dispose()

def getThirdAction(**context):
    """Append labour and equipment rows from workflow 65efc918df43024bba49af29.

    Downloads the workflow export, and for every record produces

    * one "header" row (date, portion, location, activity) plus one row per
      entry of ``A05 Labour`` carrying ``labour_type`` / ``labour_number``;
    * one header row plus one row per entry of ``A06 Equipment`` carrying
      ``equipment_name`` / ``equipment_working_number`` /
      ``equipment_Idle_number`` (equipment rows are only produced for
      records dated after 2024; the idle count is read from
      ``A06.4 Idling No`` in this workflow).

    Both result sets are written with ``if_exists='append'`` to
    ``site_diary_activities_labour_dc202312`` and
    ``site_diary_activities_equipment_dc202312``. An empty export appends
    nothing and returns normally.

    Args:
        **context: Airflow task context; ``context["ti"]`` is used to pull
            the API token from XCom (key ``"token"``).

    Raises:
        requests.HTTPError: the export endpoint answered 4xx/5xx.
    """
    token = context.get("ti").xcom_pull(key="token")
    response = requests.get(
        url=f"{dRoW_api_end_url}/api/module/document-export/airflow/workflow/65efc918df43024bba49af29?export_type=0",
        headers={
            "x-access-token": f"Bearer {token}",
            "ICWPxAccessKey": "nd@201907ICWP_[1AG:4UdI){n=b~"
        },
        # (connect, read) seconds; keeps a stalled server from blocking the
        # worker forever.
        timeout=(10, 600),
    )
    response.raise_for_status()
    records = response.json()
    if not records:
        # Nothing to append; the target tables are left untouched.
        print("workflow 65efc918df43024bba49af29 export returned no records")
        return

    column_map = {
        "A01 Date": "a01_date",
        "A02 Portion": "a02_portion_no",
        "A02 Location": "a03_location",
        "A03 Activity": "a04_activity",
    }
    # Literal replacements applied, in this order, to every column name.
    replacements = (
        (' ', '_'), ('.', '_'), ('(', '_'), (')', ''),
        ('%', 'percent'), ('/', '_'), ('__', '_'),
    )

    def finalise(parts):
        """Stack the per-record frames, tidy column names, parse the date."""
        frame = pd.concat(parts, ignore_index=True).rename(columns=column_map)
        tidy_names = []
        for name in frame.columns:
            for old, new in replacements:
                name = name.replace(old, new)
            tidy_names.append(name)
        frame.columns = tidy_names
        # Element-wise conversion, so every value is parsed independently.
        frame['a01_date'] = frame['a01_date'].apply(pd.to_datetime)
        return frame

    # Frames are collected in lists and concatenated once: DataFrame.append
    # no longer exists in pandas 2.x and copies the whole frame on every call.
    labour_parts = []
    equipment_parts = []
    for record in records:
        data = record['data']
        # One-row frame with exactly the mapped fields; reindex (rather than
        # strict column selection) turns a field absent from this record
        # into NaN instead of raising KeyError.
        header = pd.json_normalize(data).reindex(columns=list(column_map))

        labour_rows = [header]
        # A missing or null labour list is treated like an empty one.
        for entry in data.get('A05 Labour') or []:
            row = header.copy()
            row['labour_type'] = str(entry['A05.2 Trade'])
            headcount = entry.get('A05.3 No.')
            row['labour_number'] = 0 if headcount is None else headcount
            labour_rows.append(row)
        labour_parts.append(pd.concat(labour_rows))

        date_text = data.get('A01 Date')
        diary_date = (
            datetime.fromisoformat(date_text.replace("Z", "+00:00"))
            if date_text else None
        )
        equipment_rows = [header]
        # temp fix to solve storage issue: only records dated after 2024
        # contribute equipment rows.
        if diary_date is not None and diary_date.year > 2024:
            for item in data.get('A06 Equipment') or []:
                row = header.copy()
                working = item.get('A06.3 Working No.')
                idle = item.get('A06.4 Idling No')
                row['equipment_name'] = str(item['A06.1 Type'])
                row['equipment_working_number'] = 0 if working is None else working
                row['equipment_Idle_number'] = 0 if idle is None else idle
                equipment_rows.append(row)
        equipment_parts.append(pd.concat(equipment_rows))

    labour = finalise(labour_parts)
    equipment = finalise(equipment_parts)

    engine = create_engine(getdrowPSQLConnectionString())
    try:
        # The context manager closes the connection; no explicit close needed.
        with engine.connect() as conn:
            labour.to_sql('site_diary_activities_labour_dc202312', con=conn, if_exists='append', index=False)
            equipment.to_sql('site_diary_activities_equipment_dc202312', con=conn, if_exists='append', index=False)
    finally:
        engine.dispose()

def getForthAction(**context):
    """Append per-date record counts from workflow 65efc918df43024bba49af29.

    Downloads the workflow export, keeps the date and location of every
    record, counts the records per date and appends one row per distinct
    date (first location seen for that date plus a ``Count`` column) to
    ``site_diary_activities_general_count_dc202312``
    (``if_exists='append'``). An empty export appends nothing and returns
    normally.

    Args:
        **context: Airflow task context; ``context["ti"]`` is used to pull
            the API token from XCom (key ``"token"``).

    Raises:
        requests.HTTPError: the export endpoint answered 4xx/5xx.
    """
    token = context.get("ti").xcom_pull(key="token")
    response = requests.get(
        url=f"{dRoW_api_end_url}/api/module/document-export/airflow/workflow/65efc918df43024bba49af29?export_type=0",
        headers={
            "x-access-token": f"Bearer {token}",
            "ICWPxAccessKey": "nd@201907ICWP_[1AG:4UdI){n=b~"
        },
        # (connect, read) seconds; keeps a stalled server from blocking the
        # worker forever.
        timeout=(10, 600),
    )
    response.raise_for_status()
    records = response.json()
    if not records:
        # Nothing to append; the target table is left untouched.
        print("workflow 65efc918df43024bba49af29 export returned no records")
        return

    column_map = {
        "A01 Date": "a01_date",
        "A02 Location": "a03_location",
    }
    # Literal replacements applied, in this order, to every column name.
    replacements = (
        (' ', '_'), ('.', '_'), ('(', '_'), (')', ''),
        ('%', 'percent'), ('/', '_'), ('__', '_'),
    )

    # Build all one-row frames first and concatenate once: DataFrame.append
    # no longer exists in pandas 2.x and copies the whole frame on every call.
    frames = [
        pd.json_normalize(record['data']).reindex(columns=list(column_map))
        for record in records
    ]
    diary = pd.concat(frames, ignore_index=True).rename(columns=column_map)
    tidy_names = []
    for name in diary.columns:
        for old, new in replacements:
            name = name.replace(old, new)
        tidy_names.append(name)
    diary.columns = tidy_names

    # Number of records per date, joined back onto one row per date.
    per_date = diary.groupby('a01_date').size().rename('Count')
    daily = diary.drop_duplicates(subset='a01_date').merge(
        per_date, left_on='a01_date', right_index=True
    )
    daily['a01_date'] = daily['a01_date'].apply(pd.to_datetime)

    engine = create_engine(getdrowPSQLConnectionString())
    try:
        with engine.connect() as conn:
            daily.to_sql('site_diary_activities_general_count_dc202312', con=conn, if_exists='append', index=False)
    finally:
        engine.dispose()

# DAG "dc202312_site_diary_activity": scheduled with the cron expression
# "0 15 * * *" (once a day at 15:00), not every two minutes.
# NOTE(review): the timezone in which 15:00 is evaluated depends on the
# Airflow configuration — confirm.
with DAG(
        dag_id="dc202312_site_diary_activity",
        schedule_interval="0 15 * * *",
        default_args={
            "owner": "airflow",
            # A failed task is retried once, five minutes later.
            "retries": 1,
            "retry_delay": timedelta(minutes=5),
            "start_date": datetime(2023, 1, 17)
        },
        # Do not backfill the runs between start_date and today.
        catchup=False) as f:
    
    # Each assignment below rebinds the module-level function name to the
    # operator wrapping it: the right-hand side is evaluated first, so
    # python_callable still receives the original function.
    # op_kwargs={"name": "Dylan"} is passed to the callables, but none of
    # them reads a "name" argument.
    # NOTE(review): provide_context looks like an Airflow 1.x option —
    # confirm it is still needed on the deployed version.
    getFirstAction = PythonOperator(
        task_id="getFirstAction",
        python_callable=getFirstAction,
        op_kwargs={"name": "Dylan"},
        provide_context=True,
    )
    
    getSecondAction = PythonOperator(
        task_id="getSecondAction",
        python_callable=getSecondAction,
        op_kwargs={"name": "Dylan"},
        provide_context=True,
    )
    
    getThirdAction = PythonOperator(
        task_id="getThirdAction",
        python_callable=getThirdAction,
        op_kwargs={"name": "Dylan"},
        provide_context=True,
    )

    getForthAction = PythonOperator(
        task_id="getForthAction",
        python_callable=getForthAction,
        op_kwargs={"name": "Dylan"},
        provide_context=True,
    )

    # Pushes the API token to XCom; it is the first task of the chain.
    getDrowToken = PythonOperator(
        task_id="getDrowToken",
        python_callable=getDrowToken,
        provide_context=True,
        # op_kwargs={"name": "Dylan"}
    )

# Strictly sequential chain. The order matters: getFirstAction and
# getSecondAction write their tables with if_exists='replace', while
# getThirdAction and getForthAction append to those same tables.
getDrowToken >> getFirstAction >> getSecondAction >> getThirdAction >> getForthAction
DAG: dc202312_site_diary_activity

schedule: 0 15 * * *

dc202312_site_diary_activity