# Airflow - DAGs

try:

    from datetime import timedelta
    from airflow import DAG
    
    from airflow.operators.python_operator import PythonOperator
    from datetime import datetime
    from pandas.io.json import json_normalize

    import pandas as pd
    import json
    import requests
    import numpy as np

    import psycopg2
    from sqlalchemy import create_engine

except Exception as e:
    print("Error {} ".format(e))

# Base URL of the dRoW API; every request URL in this module is built from it.
# NOTE(review): the "uat2" host name suggests a UAT environment — confirm this
# is the intended target for the scheduled export.
dRoW_api_end_url = "https://uat2.drow.cloud"

def getDrowToken(**context):
    """Authenticate against the dRoW API and publish the token through XCom.

    Posts the service-account credentials to ``/api/auth/authenticate`` and
    pushes the ``token`` field of the JSON reply under the XCom key
    ``"token"`` so that later tasks can pull it.

    Args:
        **context: Airflow task context; only ``context["ti"]`` is used.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
        KeyError: if the reply contains no ``token`` field.
    """
    # NOTE(review): credentials are hard-coded in source; they should live in
    # an Airflow Connection/Variable or a secrets backend instead.
    response = requests.post(
        url=f"{dRoW_api_end_url}/api/auth/authenticate",
        data={
            "username": "icwp2@drow.cloud",
            "password": "dGVzdDAxQHRlc3QuY29t",
        },
        # Without a timeout a stalled connection would block the task forever.
        timeout=60,
    )
    # Fail here with the HTTP status rather than later with a bare KeyError.
    response.raise_for_status()
    token = response.json()['token']
    context["ti"].xcom_push(key="token", value=token)


def getdrowPSQLConnectionString():
    """Build the SQLAlchemy URL for the PostgreSQL data warehouse.

    Returns:
        str: a ``postgresql://user:password@host:port/database`` URL suitable
        for ``sqlalchemy.create_engine``.
    """
    # NOTE(review): database credentials are hard-coded in source; they should
    # come from an Airflow Connection or a secrets backend.
    host = 'drowdatewarehouse.crlwwhgepgi7.ap-east-1.rds.amazonaws.com'
    db_user_name = 'dRowAdmin'
    db_user_password = 'drowsuper'
    database = 'drowDateWareHouse'
    port = "5432"

    # "postgresql://" is accepted by every SQLAlchemy release, whereas the
    # legacy "postgres://" alias is rejected from SQLAlchemy 1.4 onwards.
    conn_string = (
        f"postgresql://{db_user_name}:{db_user_password}"
        f"@{host}:{port}/{database}"
    )
    # Log the target with the password masked so it never reaches task logs.
    print(f"postgresql://{db_user_name}:***@{host}:{port}/{database}")
    return conn_string

# Fields copied from every site-diary record, keyed by their source name and
# mapped to the column name used in the warehouse tables.
_ACTIVITY_COLUMN_MAPPING = {
    "A01 Date": "a01_date",
    "A02a Section": "a02_portion_no",
    "A02a District": "a03_location",
    "A03a - Activity Type": "a04_activity",
}

# Records whose "A01 Date" sorts before this string are left out of the export.
_EARLIEST_EXPORT_DATE = "2023-01-01"

# Literal (old, new) substitutions applied in order to every output column name.
_COLUMN_NAME_REPLACEMENTS = (
    (' ', '_'),
    ('.', '_'),
    ('(', '_'),
    (')', ''),
    ('%', 'percent'),
    ('/', '_'),
    ('__', '_'),
)

# Seconds allowed for the export request; without a timeout a stalled
# connection would block the task forever.
_EXPORT_REQUEST_TIMEOUT = 300


def _fetch_site_diary_records(token):
    """Download the site-diary workflow export and return the decoded JSON."""
    # NOTE(review): the ICWPxAccessKey is hard-coded in source; it should be
    # moved to an Airflow Variable or a secrets backend.
    response = requests.get(
        url=f"{dRoW_api_end_url}/api/module/document-export/airflow/workflow/5fe47ab9f964043fba328ca3?export_type=0",
        headers={
            "x-access-token": f"Bearer {token}",
            "ICWPxAccessKey": "nd@201907ICWP_[1AG:4UdI){n=b~",
        },
        timeout=_EXPORT_REQUEST_TIMEOUT,
    )
    # Surface auth/server failures as HTTP errors instead of parsing an error body.
    response.raise_for_status()
    return response.json()


def _value_or_zero(entry, key):
    """Return ``entry[key]``, or 0 when the key is absent or its value is None."""
    value = entry.get(key)
    return 0 if value is None else value


def _normalise_column_name(name):
    """Apply the literal column-name substitutions to one name."""
    for old, new in _COLUMN_NAME_REPLACEMENTS:
        name = name.replace(old, new)
    return name


def _combine_rows(frames):
    """Concatenate single-row frames, tidy column names and parse ``a01_date``."""
    # One concat instead of repeated DataFrame.append, which is quadratic and
    # no longer exists in pandas 2.x.
    combined = pd.concat(frames, ignore_index=True, sort=False)
    combined.columns = [_normalise_column_name(name) for name in combined.columns]
    combined['a01_date'] = combined['a01_date'].apply(pd.to_datetime)
    return combined


def _export_labour_and_equipment(token, idle_column):
    """Rebuild the labour and equipment tables from the site-diary export.

    For every exported record dated on or after ``_EARLIEST_EXPORT_DATE`` (or
    with no date) each table receives one base row holding the mapped activity
    fields, followed by one row per "A06 Labour" entry (labour table) or per
    "A07 Plant" entry (equipment table). Both tables are replaced.

    Args:
        token: bearer token for the dRoW API.
        idle_column: name of the equipment-table column holding the idle count.

    Raises:
        requests.HTTPError: if the export request fails.
        ValueError: if no record is left to export.
    """
    source_columns = list(_ACTIVITY_COLUMN_MAPPING)
    labour_frames = []
    equipment_frames = []

    for record in _fetch_site_diary_records(token):
        data = record['data']
        record_date = data.get('A01 Date')
        if record_date and record_date < _EARLIEST_EXPORT_DATE:
            continue

        # reindex (rather than column selection) keeps a record whose mapped
        # field is missing: the column is simply left empty.
        base_row = (
            pd.json_normalize(data)
            .reindex(columns=source_columns)
            .rename(columns=_ACTIVITY_COLUMN_MAPPING)
        )

        labour_frames.append(base_row)
        for entry in data.get('A06 Labour') or []:
            labour_frames.append(base_row.assign(
                labour_type=str(entry['A06.2 Trade']),
                labour_number=_value_or_zero(entry, 'A06.3 No.'),
            ))

        equipment_frames.append(base_row)
        for entry in data.get('A07 Plant') or []:
            plant_columns = {
                'plant_name': str(entry['A07.2 Type']),
                'plant_working_number': _value_or_zero(entry, 'A07.3b No. of Working'),
                idle_column: _value_or_zero(entry, 'A07.4b No. of Idle'),
            }
            equipment_frames.append(base_row.assign(**plant_columns))

    if not labour_frames:
        # Fail with a clear message instead of replacing the tables with nothing.
        raise ValueError("site-diary export contains no records to load")

    labour = _combine_rows(labour_frames)
    equipment = _combine_rows(equipment_frames)

    # Connect only once the data is ready so the connection is not held open
    # during the download and transformation.
    engine = create_engine(getdrowPSQLConnectionString())
    with engine.connect() as connection:
        labour.to_sql('site_diary_activities_labour_dc201911', con=connection, if_exists='replace', index=False)
        equipment.to_sql('site_diary_activities_equipment_dc201911', con=connection, if_exists='replace', index=False)


def getFirstAction(**context):
    """Export site-diary labour and equipment rows to the warehouse.

    Pulls the API token from XCom (key ``"token"``) and replaces the tables
    ``site_diary_activities_labour_dc201911`` and
    ``site_diary_activities_equipment_dc201911``. The idle-plant count is
    stored in the column ``plant_idle_number``.

    Args:
        **context: Airflow task context; only ``context["ti"]`` is used.
    """
    token = context["ti"].xcom_pull(key="token")
    _export_labour_and_equipment(token, idle_column='plant_idle_number')


def getSecondAction(**context):
    """Export site-diary labour and equipment rows to the warehouse.

    Performs the same export into the same two tables as ``getFirstAction``;
    the only difference is that the idle-plant count is stored in the column
    ``plant_Idle_number`` (capital "I").

    Args:
        **context: Airflow task context; only ``context["ti"]`` is used.
    """
    # NOTE(review): this task overwrites the tables getFirstAction has just
    # written — confirm whether both tasks are really needed.
    token = context["ti"].xcom_pull(key="token")
    _export_labour_and_equipment(token, idle_column='plant_Idle_number')

def getThirdAction(**context):
    """Export the number of site-diary records per date to the warehouse.

    Pulls the API token from XCom (key ``"token"``), downloads the site-diary
    export, counts the records for each "A01 Date" and replaces the table
    ``site_diary_activities_general_count_dc201911`` with one row per date
    holding ``a01_date``, ``a03_location`` and ``Count``.

    Args:
        **context: Airflow task context; only ``context["ti"]`` is used.

    Raises:
        requests.HTTPError: if the export request fails.
        ValueError: if the export contains no records.
    """
    token = context["ti"].xcom_pull(key="token")
    # NOTE(review): the ICWPxAccessKey is hard-coded in source; it should be
    # moved to an Airflow Variable or a secrets backend.
    response = requests.get(
        url=f"{dRoW_api_end_url}/api/module/document-export/airflow/workflow/5fe47ab9f964043fba328ca3?export_type=0",
        headers={
            "x-access-token": f"Bearer {token}",
            "ICWPxAccessKey": "nd@201907ICWP_[1AG:4UdI){n=b~",
        },
        # Without a timeout a stalled connection would block the task forever.
        timeout=300,
    )
    # Surface auth/server failures as HTTP errors instead of parsing an error body.
    response.raise_for_status()
    records = response.json()
    if not records:
        # Fail with a clear message instead of replacing the table with nothing.
        raise ValueError("site-diary export contains no records to load")

    column_mapping = {
        "A01 Date": "a01_date",
        "A02a District": "a03_location",
    }

    # Flatten every record in one call; reindex leaves a mapped field that is
    # missing from the export empty instead of failing.
    diaries = (
        pd.json_normalize([record['data'] for record in records])
        .reindex(columns=list(column_mapping))
        .rename(columns=column_mapping)
    )

    # Literal substitutions that tidy column names for the warehouse.
    replacements = (
        (' ', '_'), ('.', '_'), ('(', '_'), (')', ''),
        ('%', 'percent'), ('/', '_'), ('__', '_'),
    )
    tidy_names = []
    for name in diaries.columns:
        for old, new in replacements:
            name = name.replace(old, new)
        tidy_names.append(name)
    diaries.columns = tidy_names

    counts = diaries.groupby('a01_date').size().rename('Count')
    # Keep the first record of each date (its location is the one stored) and
    # attach that date's record count.
    summary = diaries.drop_duplicates(subset='a01_date').merge(
        counts, left_on='a01_date', right_index=True)
    summary['a01_date'] = summary['a01_date'].apply(pd.to_datetime)

    engine = create_engine(getdrowPSQLConnectionString())
    with engine.connect() as connection:
        summary.to_sql('site_diary_activities_general_count_dc201911', con=connection, if_exists='replace', index=False)

# Scheduled with the cron expression "0 15 * * *", i.e. once a day at 15:00
# (presumably in Airflow's configured default timezone — TODO confirm).
with DAG(
        dag_id="dc201911_site_diary_activity",
        schedule_interval="0 15 * * *",
        default_args={
            "owner": "airflow",
            "retries": 1,
            "retry_delay": timedelta(minutes=5),
            "start_date": datetime(2023, 1, 17)
        },
        catchup=False) as f:

    # NOTE: each assignment below rebinds the module-level function name to
    # its operator; the callable is captured by python_callable first.
    # NOTE(review): none of the callables visible in this file reads the
    # "name" entry of op_kwargs — confirm whether it can be dropped.
    getFirstAction = PythonOperator(
        task_id="getFirstAction",
        python_callable=getFirstAction,
        op_kwargs={"name": "Dylan"},
        provide_context=True,
    )

    getSecondAction = PythonOperator(
        task_id="getSecondAction",
        python_callable=getSecondAction,
        op_kwargs={"name": "Dylan"},
        provide_context=True,
    )

    getThirdAction = PythonOperator(
        task_id="getThirdAction",
        python_callable=getThirdAction,
        op_kwargs={"name": "Dylan"},
        provide_context=True,
    )

    getDrowToken = PythonOperator(
        task_id="getDrowToken",
        python_callable=getDrowToken,
        provide_context=True,
    )

# Every export task pulls the API token from XCom, so each one must run after
# getDrowToken. getThirdAction previously had no upstream task and could start
# before the token had been pushed.
getDrowToken >> getFirstAction >> getSecondAction
getDrowToken >> getThirdAction
# DAG: dc201911_site_diary_activity
#
# Schedule: 0 15 * * * (once a day at 15:00)
#
# dc201911_site_diary_activity