# Airflow - DAGs

try:

    from datetime import datetime, timezone, timedelta
    from airflow import DAG
    
    from airflow.operators.python_operator import PythonOperator
    from airflow.operators.http_operator import SimpleHttpOperator
    from datetime import datetime
    from pandas.io.json import json_normalize
    from airflow.operators.postgres_operator import PostgresOperator

    import pandas as pd
    import json
    import requests
    import numpy as np

    import psycopg2
    from sqlalchemy import create_engine

except Exception as e:
    print("Error {} ".format(e))

# Base URL of the dRoW API; the request helpers below build their endpoint
# URLs by appending a path to it.
dRoW_api_end_url = "https://drow.cloud"

def getDrowToken(**context):
    """Authenticate against the dRoW API and publish the token through XCom.

    Posts the service account credentials to ``/api/auth/authenticate`` and
    pushes the ``token`` field of the JSON reply to XCom under the key
    ``"token"``.

    Args:
        **context: Airflow task context; ``context["ti"]`` must provide
            ``xcom_push``.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
        KeyError: if a successful reply carries no ``token`` field.
    """
    # NOTE(review): credentials are hardcoded in source control; consider
    # moving them to an Airflow Connection/Variable.
    response = requests.post(
        url=f"{dRoW_api_end_url}/api/auth/authenticate",
        data={
            "username": "icwp2@drow.cloud",
            "password": "dGVzdDAxQHRlc3QuY29t",
        },
    )
    # Surface a rejected login as an HTTP error instead of an opaque
    # KeyError/JSON error raised while reading the error body.
    response.raise_for_status()
    context["ti"].xcom_push(key="token", value=response.json()['token'])

def _sheet_cell_value(cell):
    """Return the Python value stored in one sheet cell dict.

    * multi-value ``Table`` cells -> list of ``{sub colName: sub value}``
      dicts, one per entry of ``tableValue``;
    * other multi-value cells -> the cell's ``valueArray``;
    * scalar cells -> ``value``, with a missing value or the literal
      ``'NA'`` mapped to ``None``.
    """
    # Loose equality kept on purpose: a missing/None flag compares unequal,
    # exactly like the original nested checks.
    if cell.get('multValue') == True:  # noqa: E712
        if cell['colType'] == 'Table':
            return [
                {sub['colName']: sub.get('value') for sub in table_row['subValues']}
                for table_row in cell['tableValue']
            ]
        return cell['valueArray']

    value = cell.get('value')
    return None if value == 'NA' else value


def getSheetData(token , sheetId):
    """Download a dRoW sheet and flatten its records into plain dicts.

    Requests ``/api/sheets/{sheetId}`` with records included and expects a
    JSON body with a ``header`` list (items carrying ``colName``) and a
    ``record`` list (items carrying a ``values`` list of cell dicts).

    Args:
        token: API token, sent as ``Bearer <token>`` in ``x-access-token``.
        sheetId: Identifier of the sheet to download.

    Returns:
        One dict per record, mapping column name to cell value. Only cells
        whose ``colName`` appears in the sheet header are kept.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    response = requests.get(
        url=f"{dRoW_api_end_url}/api/sheets/{sheetId}?with_records=true&fields=",
        headers={
            "x-access-token": f"Bearer {token}",
        },
    )
    # Fail loudly on an HTTP error rather than hitting KeyError('header')
    # while parsing an error body.
    response.raise_for_status()
    sheet = json.loads(response.text)

    # Set membership replaces the per-cell scan over every header entry.
    header_names = {column['colName'] for column in sheet['header']}

    dataToExtract = []
    for record in sheet['record']:
        objectToPush = {}
        for cell in record['values']:
            col_name = cell['colName']
            if col_name in header_names:
                # Table cells used to crash here (`s.value`, `list.push`) and
                # their rows were never stored; they are now kept as a list
                # of dicts under the column name.
                objectToPush[col_name] = _sheet_cell_value(cell)
        dataToExtract.append(objectToPush)
    return dataToExtract

def getWorkflowData(token , workflowId):
    """Download the airflow export of a dRoW workflow as parsed JSON.

    Args:
        token: API token, sent as ``Bearer <token>`` in ``x-access-token``.
        workflowId: Identifier of the workflow to export.

    Returns:
        The decoded JSON body of the export endpoint
        (``export_type=0``), returned as-is.

    Raises:
        requests.HTTPError: if the API answers with a 4xx/5xx status.
    """
    response = requests.get(
        url=f"{dRoW_api_end_url}/api/module/document-export/airflow/workflow/{workflowId}?export_type=0",
        headers={
            "x-access-token": f"Bearer {token}",
        },
    )
    # Without this an error payload would be handed to the caller as if it
    # were workflow data, and the caller rewrites its target table with
    # if_exists='replace'.
    response.raise_for_status()
    return json.loads(response.text)

def getdrowPSQLConnectionString():
    """Return the SQLAlchemy database URL of the dRoW PostgreSQL warehouse.

    The URL uses the ``postgresql://`` scheme. The legacy ``postgres://``
    alias is no longer accepted by SQLAlchemy 1.4+ (``create_engine`` fails
    to load the dialect), while ``postgresql://`` works on old and new
    versions alike.

    Returns:
        A URL of the form ``postgresql://user:password@host:port/database``.
    """
    # NOTE(review): host and credentials are hardcoded in source control;
    # consider sourcing them from an Airflow Connection instead.
    host = 'drowdatewarehouse.crlwwhgepgi7.ap-east-1.rds.amazonaws.com'
    dbUserName = 'dRowAdmin'
    dbUserPassword = 'drowsuper'
    database = 'drowDateWareHouse'
    port = "5432"

    return f'postgresql://{dbUserName}:{dbUserPassword}@{host}:{port}/{database}'

def pipelineProcess(**context):
    token = context.get("ti").xcom_pull(key="token")
    # Contract Data
    Data = getSheetData(token, "63fd68e49f48080c646e7f32")
    # "Section and key date data"
    Data2 = getSheetData(token, "63fd69ee4fa5210cfa5824db")
    
    conn_string = getdrowPSQLConnectionString()
    db = create_engine(conn_string)
    conn = db.connect()

    df = pd.DataFrame()
    _df = pd.DataFrame()
    with conn as conn:
        for x in Data:
            df_nested_list = json_normalize(x)
            df2 = df_nested_list
            df = df.append(df2)        
        df['starting date']=df['starting date'].apply(pd.to_datetime)
        df['ori comp date']=df['ori comp date'].apply(pd.to_datetime)
        df.columns = df.columns.str.replace(' ', '_').str.replace('.', '').str.replace('(', '_').str.replace(')', '').str.replace('%', 'percent').str.replace('/', '_')
        df.to_sql('c5_nec_section_of_work', con=conn, if_exists='replace', index= False)
        
        for x in Data2:
            df_nested_list = json_normalize(x)
            df2=df_nested_list
            _df = _df.append(df2)
        _df['Starting Date']=_df['Starting Date'].apply(pd.to_datetime)
        _df['Original completion dates']=_df['Original completion dates'].apply(pd.to_datetime)
        _df.columns = _df.columns.str.replace(' ', '_').str.replace('.', '').str.replace('(', '_').str.replace(')', '').str.replace('%', 'percent').str.replace('/', '_')
        _df.to_sql('c5_nec_section_of_work_key_date', con=conn, if_exists='replace', index= False)
    
    #"PWDD and Target Cost with Actual Monthly Total"
    Data = getSheetData(token, "63fef067fc3ac00c7190564c")
    df = pd.DataFrame.from_dict(Data)
    df = df.replace(',', '', regex=True)
    numerics = df.select_dtypes(include="number").columns
    df=df.apply(pd.to_numeric, errors='ignore')
    df[numerics]=df[numerics].apply(lambda x: np.round(x, decimals=5))
    df['IP No.']=df['IP No.'].astype(str)
    df['Month - Year']=df['Month - Year'].apply(pd.to_datetime)
    df.columns = df.columns.str.replace(' ', '_').str.replace('.', '_').str.replace('(', '_').str.replace(')', '').str.replace('%', 'percent')

    db = create_engine(conn_string)
    conn = db.connect()
    with conn as conn:
        df.to_sql('c5_finance_data', con=conn, if_exists='replace')
    conn.close()

    # "Approved and Forecast Price Bar Chart"
    data = getSheetData(token, "63fd68369f48080c646e7bab")
    df = pd.DataFrame.from_dict(data)
    # df = df.tail(1)
    numerics = df.select_dtypes(include="number").columns
    df=df.apply(pd.to_numeric, errors='ignore')
    df[numerics]=df[numerics].apply(lambda x: np.round(x, decimals=5))
    df['Month - Year']=df['Month - Year'].apply(pd.to_datetime)
    df.columns = df.columns.str.replace(' ', '_').str.replace('.', '_').str.replace('(', '_').str.replace(')', '').str.replace('%', 'percent')
    db = create_engine(conn_string)
    conn = db.connect()
    with conn as conn:
        df.to_sql('c5_finance_status_data', con=conn, if_exists='replace')
    conn.close()

    # EOT DATA
    data = getSheetData(token, "63fc9ef84243400ca9af7c70")
    df = pd.DataFrame.from_dict(data)
    numerics = df.select_dtypes(include="number").columns
    df=df.apply(pd.to_numeric, errors='ignore')
    df[numerics]=df[numerics].apply(lambda x: np.round(x, decimals=5))
    df['Month - Year']=df['Month - Year'].apply(pd.to_datetime)
    df.columns = df.columns.str.replace(' ', '_').str.replace('.', '_').str.replace('(', '_').str.replace(')', '').str.replace('%', 'percent')
    db = create_engine(conn_string)
    conn = db.connect()
    with conn as conn:
        df.to_sql('c5_eot_data', con=conn, if_exists='replace')
    conn.close()

    # "Programme Data"
    data= getSheetData(token, "63fd698ed6779f0c607a677c")
    if data:
        df = pd.DataFrame.from_dict(data)
        df['Submission Date']=df['Submission Date'].apply(lambda row : datetime.strptime(row[0:24], '%a %b %d %Y %H:%M:%S'))
        df['Acceptance Date']=df['Acceptance Date'].apply(lambda row : datetime.strptime(row[0:24], '%a %b %d %Y %H:%M:%S'))
        df.columns = df.columns.str.replace(' ', '_').str.replace('.', '_').str.replace('(', '_').str.replace(')', '').str.replace('%', 'percent')
        db = create_engine(conn_string)
        conn = db.connect()
        with conn as conn:
            df.to_sql('c5_programme_data', con=conn, if_exists='replace')
        conn.close()

    #"key date Planned Completion Date (PCD)"
    data = getSheetData(token, "63fd6a2b86bb350c6318686a")
    df = pd.DataFrame.from_dict(data)
    df['Planned Completion Date(PCD)']=df['Planned Completion Date(PCD)'].apply(lambda row: row.split(' (')[0])

    df['Planned Completion Date(PCD)'] = df['Planned Completion Date(PCD)'].apply(lambda row: datetime.strptime(row[0:24], '%a %b %d %Y %H:%M:%S') if len(row) == 19 else datetime.strptime(row, '%a %b %d %Y %H:%M:%S GMT%z'))
    df.columns = df.columns.str.replace(' ', '_').str.replace('.', '_').str.replace('(', '_').str.replace(')', '').str.replace('%', 'percent')
    db = create_engine(conn_string)
    conn = db.connect()
    with conn as conn:
        df.to_sql('c5_key_date_data', con=conn, if_exists='replace')
    conn.close()
    
    # CAS
    Data = getWorkflowData(token, "637c7d22b38f8ca02f5c49ab")
    Mapping= {
            "Original Doc No.": "Original_Doc_No",
            "NEC Doc Type": "NEC_Doc_Type",
            "NEC Event No.": "NEC_Event_No",
            "Doc Ver.": "Doc_Ver",
            "Doc Date": "Doc_Date",
            "Subject": "Subject",
            "From": "From",
            "To": "To",
            "CE Amount": "CE_PMI_Amount",
            "CE Increase / Decrease": "CE_Increase_Decrease",
            "Quotation Status": "Quotation_Status",
            "NEC Clause": "NEC_Clause",
            "Receive Date": "Receive_Date"
    }

    conn_string = getdrowPSQLConnectionString()
    #  #create_engine('mysql+mysqldb://root:password@localhost:3306/mydbname', echo = False)
    # conn_string = ('postgres://' +
    #                        dbUserName + ':' + 
    #                        dbUserPassword +
    #                        '@' + host + ':' + port +
    #                        '/' + database)

    # # df = context.get("ti").xcom_pull(key="InsertData")
    # # print(df)
    # # conn_string = 'postgres://user:password@host/data1'
    
    # db = create_engine(conn_string)
    conn = db.connect()
    # print('db connected')
    df = pd.DataFrame()
    i=0
    with conn as conn:
        for x in Data:
            try:
                if len(x['data'].keys()) == 0:
                    continue
                df_nested_list = json_normalize(x['data'])
                # print('process 1')
                # print(x['data'].keys())
                df2 = df_nested_list.reindex(columns=Mapping.keys())
                df2['record_status'] = x['Status']
                df2['NEC Doc Title']=x['data']['NEC Doc Type']+x['data']['NEC Event No.']
                # This line can remain as is
                df2['Doc Org Ver'] = x['data'].get('Doc Ver.')

                # Use .get() to safely access 'Receive Date'. It returns None if the key doesn't exist.
                original_receive_date = x['data'].get('Receive Date')

                # Check if the original 'Receive Date' is considered empty/null.
                # The check for pandas.isna is good for handling NaN values if the data comes from a DataFrame.
                if original_receive_date in (None, '') or (isinstance(original_receive_date, float) and pd.isna(original_receive_date)):
                    df2['withReceiveDate'] = False

                    # Initialize a variable to hold the date from the logs.
                    receipt_by_contractor_date = None

                    # Safely check if 'ApproveLogSummary' exists and is a list before looping.
                    if 'ApproveLogSummary' in x['data'] and isinstance(x['data']['ApproveLogSummary'], list):
                        # Loop through each summary log to find the one we need.
                        for log_summary in x['data']['ApproveLogSummary']:
                            # Check if the statusName matches our target.
                            if log_summary.get('statusName') == "Receipt by Contractor":
                                # If it matches, get the 'from' date and stop looping.
                                receipt_by_contractor_date = log_summary.get('from')
                                break

                    # After the loop, decide which date to use.
                    if receipt_by_contractor_date:
                        # If we successfully found the date in the logs, use it.
                        df2['Receive Date'] = receipt_by_contractor_date
                    else:
                        # If the 'Receipt by Contractor' status was not found, use the fallback 'Doc Date'.
                        df2['Receive Date'] = x['data'].get('Doc Date')
                else:
                    # If the original 'Receive Date' has a valid value, use it.
                    df2['withReceiveDate'] = True
                    df2['Receive Date'] = original_receive_date
                y=0
                if x['data']['Doc Ver.'] == None:
                    df2['Doc Ver.'] = y
                elif x['data']['Doc Ver.'].startswith('Rev. '):
                    y = x['data']['Doc Ver.'].replace('Rev. ', '')
                    y = int(y)
                else:
                    y = x['data']['Doc Ver.'].replace('-', '').replace('r','')
                    if y!='' and not y.isnumeric():
                        last_letter = y[-1]
                        # print('ver',y)
                        y = int(ord(last_letter)) - int(ord('A')) + 1
                        # print(y)
                    elif y =='':
                        y = 0
                    else :
                        y = int(y)
                    df2['Doc Ver.'] = y
                if (not df2['NEC Doc Title'].empty and 'NEC Doc Title' in df.columns):
                    # print (y)
                    # print('NEC Doc Title' in df.columns)
                    check_ver_df = df.loc[(df['NEC Doc Title'] == x['data']['NEC Doc Type']+x['data']['NEC Event No.'])]
                    if check_ver_df.empty:
                        df2['is_latest'] = 'Yes'
                    else :
                        check_ver_df2 = check_ver_df.loc[(check_ver_df['Doc Ver.'] > y)]
                        if not check_ver_df2.empty:
                            df2['is_latest'] = "No"
                        else:
                            df.loc[(df['NEC Doc Title'] == x['data']['NEC Doc Type']+x['data']['NEC Event No.']) & ( df['Doc Ver.']<y), 'is_latest'] = 'No'
                            df2['is_latest'] = 'Yes'
                else:
                    df2['is_latest'] = 'Yes'
                df2['NEC Doc Title With Version']=x['data']['NEC Doc Type']+x['data']['NEC Event No.']+'-'+str(y)

                # Handle Subject
                df2['Subject'] = x['data']['Subject'].replace("'","")
                
                if (x['data'].get('NEC Doc Type') or '').strip().upper() == 'PMN-' and y==0 and (x['Status'] == 'Receipt by Contractor' or x['Status'] == 'Closed'):
                    df2['From_Status'] = '1. CE notified'
                elif (x['data'].get('NEC Doc Type') or '').strip().upper() == 'CSQ-' and y==0:
                    df2['From_Status'] = '2. Quotation Submitted'
                elif (x['data'].get('NEC Doc Type') or '').strip().upper() == 'QA-'  and (x['Status'] == 'Receipt by Contractor' or x['Status'] == 'Closed'):
                    df2['From_Status'] = '3. CE implemented'
                else:
                    df2['From_Status'] = None

                if len(x['data']['Change to Time'])>0 and x['data']['NEC Doc Type']!='EW-':
                    df4=pd.DataFrame()
                    for change_to_time_table in x['data']['Change to Time']:
                        df3=df2.copy()
                        i = i+1
                        if 'Key Date' in change_to_time_table:
                            df3['Key Date'] = change_to_time_table['Key Date']
                        if 'Extension in days' in change_to_time_table:
                            df3['Extension in days'] = change_to_time_table['Extension in days']
                        if 'Ori Completion Date' in change_to_time_table:
                            df3['Ori Completion Date'] = change_to_time_table['Ori Completion Date']
                        if 'Revised Completion Date' in change_to_time_table:
                            df3['Revised Completion Date'] = change_to_time_table['Revised Completion Date']
                        if i >0:
                            df2['From_Status'] = None
                        df4 = df4.append(df3)
                    i = 0
                    df2 = df2.iloc[0:0]
                    df2=df2.append(df4)
                    # print('process 2')
                    # print('loading into DB')
                df = df.append(df2)
            except:
                continue
        # df['is_latest'].fillna('No',inplace=True)
        df.rename(columns=Mapping, inplace=True)
        fields_to_adjust = ['Doc_Date', 'Ori Completion Date', 'Revised Completion Date', 'Receive_Date']

        for field in fields_to_adjust:
            if field in df.columns:
                df[field] = df[field].apply(pd.to_datetime)
                df[field] = df[field] - pd.Timedelta(hours=8)
        # df['Doc_Date']=df['Doc_Date'].apply(pd.to_datetime)
        # df['Doc_Date'] = df['Doc_Date'] - pd.Timedelta(hours=8)
        # df['Ori Completion Date']=df['Ori Completion Date'].apply(pd.to_datetime)
        # df['Ori Completion Date'] = df['Ori Completion Date'] - pd.Timedelta(hours=8)
        # df['Revised Completion Date']=df['Revised Completion Date'].apply(pd.to_datetime)
        # df['Revised Completion Date'] = df['Revised Completion Date'] - pd.Timedelta(hours=8)
        df.columns = df.columns.str.replace(' ', '_').str.replace('.', '').str.replace('(', '_').str.replace(')', '').str.replace('%', 'percent').str.replace('/', '_')
        
        def handle_quotation_status(row, df):
            """Append an age bucket to a pending quotation status.

            Rows whose ``Quotation_Status`` is 'Quotation to be submitted' or
            'Quotation to be assessed' are compared against the newest
            document of the related type ('CSQ-' resp. 'QA-') sharing the
            same ``NEC_Event_No``. When no such document exists, the row's
            own ``Receive_Date`` is compared against today instead. The gap
            in days divided by 30 decides between the '> 24 months' and
            '< 24 months' label. Any other status is returned unchanged.
            """
            status = row['Quotation_Status']
            if status == 'Quotation to be submitted':
                related_type = 'CSQ-'
                over_label = 'Quotation to be submitted > 24 months'
                under_label = 'Quotation to be submitted < 24 months'
            elif status == 'Quotation to be assessed':
                related_type = 'QA-'
                over_label = 'Quotation to be assessed > 24 months'
                under_label = 'Quotation to be assessed < 24 months'
            else:
                return status

            # Documents of the related type raised for the same event.
            event_mask = df['NEC_Event_No'] == row['NEC_Event_No']
            type_mask = df['NEC_Doc_Type'] == related_type
            related_docs = df[event_mask & type_mask]

            if related_docs.empty:
                # Nothing to compare with: measure the row's age from today,
                # both sides timezone-naive and truncated to midnight.
                received = row['Receive_Date']
                now = pd.Timestamp.today().tz_localize(None).normalize()
                elapsed = now - received.tz_localize(None).normalize()
            else:
                newest = related_docs.sort_values(by='Receive_Date', ascending=False).iloc[0]
                elapsed = row['Receive_Date'] - newest['Receive_Date']

            return over_label if elapsed.days / 30 > 24 else under_label
        
        df['Quotation_Status'] = df.apply(lambda row: handle_quotation_status(row, df), axis=1)
        df['Receive_Date'] = df['Receive_Date'].apply(pd.to_datetime) + pd.Timedelta(hours=8)
        df['Original_Doc_No'] = df['Original_Doc_No'].astype(str).str.strip()

        # def handle_record_status(row, df):
        #     same_event_df = df[df['Original_Doc_No'] == row['Original_Doc_No']]
        #     record_statuses = same_event_df['record_status'].unique()

        #     if 'Closed' in record_statuses:
        #         status_df = pd.concat(same_event_df[same_event_df['record_status'] == 'Closed'], same_event_df[same_event_df['NEC_Doc_Type'] == 'PMN-'])
        #         return handle_ce_status(row, status_df)
        #     elif 'Receipt by Contractor' in record_statuses:
        #         status_df = pd.concat(same_event_df[same_event_df['record_status'] == 'Receipt by Contractor'], same_event_df[same_event_df['NEC_Doc_Type'] == 'PMN-']), 
        #         return handle_ce_status(row, status_df)
        #     else:
        #         return handle_ce_status(row, same_event_df)

        def handle_ce_status(row, df):
            """Derive the ``(CE status, subject)`` pair for one row.

            Considers every row of ``df`` that shares ``row['Original_Doc_No']``
            and whose ``record_status`` is 'Closed' or 'Receipt by Contractor'.
            Their document types (stripped, upper-cased) select one rule, in
            this priority order:

            1. PMN- and QA-   -> the newest QA- row is 'CE implemented'.
            2. PMN- and CSQ-  -> the newest CSQ- row is 'Quotation to be
               assessed', aged from the newest PMN- ``Receive_Date``.
            3. PMN- and PMIQ- -> a PMIQ- row dated like the newest PMIQ- or
               PMN- row is 'Quotation to be submitted', aged from its own date.
            4. PMN- only      -> the newest PMN- row, same label as rule 3.
            5. PMI- (no PMN-) -> PMI- rows whose clause is not '61.2': the
               newest is 'Quotation to be submitted' (aged), older ones are
               'CE to be notified'.
            6. NCE- (no PMN-/PMI-) -> NCE- rows whose clause is not '61.2'
               are 'CE to be notified'.
            7. none of the above -> ``('', row['Subject'])``.

            "Aged" labels get ' > 24 months' when more than two 365-day years
            separate the reference date from today, else ' < 24 months'.
            Rules 1-4 return the newest PMN- subject; rules 5-7 return the
            row's own subject. A row that the selected rule does not single
            out yields ``('', '')``.

            Args:
                row: The row being classified (a Series from ``df.apply``).
                df: The full frame the row belongs to.

            Returns:
                Tuple ``(ce_status, subject)``.
            """
            acceptable_record_statuses = ['Closed', 'Receipt by Contractor']
            # Work on an explicit copy: the frame is modified below, and
            # doing that on a filtered slice triggers SettingWithCopyWarning.
            same_event_df = df[
                (df['Original_Doc_No'] == row['Original_Doc_No']) &
                (df['record_status'].isin(acceptable_record_statuses))
            ].copy()
            same_event_df['NEC_Doc_Type'] = (
                same_event_df['NEC_Doc_Type']
                    .astype(str)
                    .str.strip()
                    .str.upper()
            )
            # Newest first, so the first row of any subset is its latest document.
            same_event_df = same_event_df.sort_values(by='Receive_Date', ascending=False)
            doc_types = set(same_event_df['NEC_Doc_Type'])
            today = pd.Timestamp.today().tz_localize(None).normalize()

            def latest_of(doc_type):
                """Newest row of ``doc_type`` in the event, or None if absent."""
                rows = same_event_df[same_event_df['NEC_Doc_Type'] == doc_type]
                return None if rows.empty else rows.iloc[0]

            def dated_like(*candidates):
                """True if row's Receive_Date equals that of any present candidate."""
                return any(
                    candidate is not None
                    and row['Receive_Date'] == candidate['Receive_Date']
                    for candidate in candidates
                )

            def aged(prefix, reference_date):
                """Suffix ``prefix`` with the 24-month bucket of ``reference_date``."""
                elapsed_years = (today - reference_date.tz_localize(None).normalize()).days / 365
                if elapsed_years > 2:
                    return prefix + ' > 24 months'
                return prefix + ' < 24 months'

            row_type = row['NEC_Doc_Type']
            # Not None exactly when 'PMN-' is among doc_types.
            latest_pmn = latest_of('PMN-')

            if 'PMN-' in doc_types and 'QA-' in doc_types:
                if row_type == 'QA-' and dated_like(latest_of('QA-')):
                    return 'CE implemented', latest_pmn['Subject']

            elif 'PMN-' in doc_types and 'CSQ-' in doc_types:
                if row_type == 'CSQ-' and dated_like(latest_of('CSQ-')):
                    # Age is measured from the PMN- document, not the quotation.
                    status = aged('Quotation to be assessed', latest_pmn['Receive_Date'])
                    return status, latest_pmn['Subject']

            elif 'PMN-' in doc_types and 'PMIQ-' in doc_types:
                if row_type == 'PMIQ-' and dated_like(latest_of('PMIQ-'), latest_pmn):
                    status = aged('Quotation to be submitted', row['Receive_Date'])
                    return status, latest_pmn['Subject']

            elif 'PMN-' in doc_types:
                if row_type == 'PMN-' and dated_like(latest_pmn):
                    status = aged('Quotation to be submitted', row['Receive_Date'])
                    return status, latest_pmn['Subject']

            elif 'PMI-' in doc_types:
                # No PMN- exists in this branch. The original indexed the
                # missing PMN- row here and raised TypeError for every PMI-
                # row that was not the newest one (or had a NaT date).
                if row_type == 'PMI-' and row['NEC_Clause'] != '61.2':
                    if dated_like(latest_of('PMI-')):
                        status = aged('Quotation to be submitted', row['Receive_Date'])
                        return status, row['Subject']
                    return 'CE to be notified', row['Subject']

            elif 'NCE-' in doc_types:
                if row_type == 'NCE-' and row['NEC_Clause'] != '61.2':
                    return 'CE to be notified', row['Subject']

            else:
                return '', row['Subject']

            return '', ''
        

        df['CE_Status'], df['Subject'] = zip(*df.apply(lambda row: handle_ce_status(row, df), axis=1))
        # df[['CE_Status', 'PMN_Subject']] = df.apply(lambda row: handle_ce_status(row, df), axis=1)
        # df['CE_Status'] = df.apply(lambda row: handle_ce_status(row, df), axis=1)
        df['Receive_Date'] = df['Receive_Date'].apply(pd.to_datetime) + pd.Timedelta(hours=8)
        
        df.columns = df.columns.str.replace(' ', '_').str.replace('.', '').str.replace('(', '_').str.replace(')', '').str.replace('%', 'percent').str.replace('/', '_')
        df.to_sql('c5_nec_cas', con=conn, if_exists='replace', index= False)
    conn.close()
    
    # Risk Registry
    resData = getWorkflowData(token, "638b33a4a1faf60c870388c2")
    db = create_engine(conn_string)
    conn = db.connect()
    df = pd.DataFrame()
    with conn as conn:
        for x in resData:
            df_nested_list = json_normalize(x['data'])
            df2 = df_nested_list
            if x['data']['Date of Early Warning'] == None:
                Date_of_Early_Warning = datetime.now(timezone.utc)
            else: 
                Date_of_Early_Warning = datetime.strptime(x['data']['Date of Early Warning'], '%Y-%m-%dT%H:%M:%S.%f%z')
            if x['data']['Date of Close of EW'] == None:
                Date_of_Close_of_EW = datetime.now(timezone.utc)
            else: 
                Date_of_Close_of_EW = datetime.strptime(x['data']['Date of Close of EW'], '%Y-%m-%dT%H:%M:%S.%f%z')

            if (Date_of_Close_of_EW - Date_of_Early_Warning) > np.timedelta64(24, 'h'):
                df2['Elapsed_Time'] = ((Date_of_Close_of_EW - Date_of_Early_Warning))
            else:
                df2['Elapsed_Time'] = np.timedelta64(0, 'D')
            if (df2['Elapsed_Time'] >= np.timedelta64(365, 'D')).bool():
                df2['Elapsed_Time_more_then_1_year'] = True
            else:
                df2['Elapsed_Time_more_then_1_year'] = False
            df2['Elapsed_Time'] = df2['Elapsed_Time'] / 1000 / 1000 / 86400000
            df = df.append(df2)

        df['Date of Close of EW']=df['Date of Close of EW'].apply(pd.to_datetime)
        df['Date of Close of EW'] = df['Date of Close of EW'] - pd.Timedelta(hours=8)
        df['Date of Early Warning']=df['Date of Early Warning'].apply(pd.to_datetime)
        df['Date of Early Warning'] = df['Date of Early Warning'] - pd.Timedelta(hours=8)
        # df['Action Party (CEDD / AECOM / CW-KL JV)']=np.array(df['Action Party (CEDD / AECOM / CRCC-PY JV)'].tolist())
        df.columns = df.columns.str.replace(' ', '_').str.replace('.', '_').str.replace('(', '_').str.replace(')', '').str.replace('/', '_').str.replace('%', 'percent')
        # df['Action_Party___CEDD_/_AECOM_/_DCK_JV']=np.array(df['Action_Party___CEDD_/_AECOM_/_DCK_JV'].tolist())
        df.to_sql('c5_nec_risk_register', con=conn, if_exists='replace', index= False)
    conn.close()
    # resData = getWorkflowData(token, "638b33a4a1faf60c870388c2")
    # db = create_engine(conn_string)
    # conn = db.connect()
    # df = pd.DataFrame()
    # with conn as conn:
    #     for x in resData:
    #         df_nested_list = json_normalize(x['data'])
    #         df2 = df_nested_list
    #         if x['data']['Date of Early Warning'] == None:
    #             Date_of_Early_Warning = datetime.now(timezone.utc)
    #         else: 
    #             Date_of_Early_Warning = datetime.strptime(x['data']['Date of Early Warning'], '%Y-%m-%dT%H:%M:%S.%f%z')
    #         if x['data']['Date of Close of EW'] == None:
    #             Date_of_Close_of_EW = datetime.now(timezone.utc)
    #         else: 
    #             Date_of_Close_of_EW = datetime.strptime(x['data']['Date of Close of EW'], '%Y-%m-%dT%H:%M:%S.%f%z')
    #         # print((Date_of_Close_of_EW - Date_of_Early_Warning))
    #         if (Date_of_Close_of_EW - Date_of_Early_Warning) > np.timedelta64(24, 'h'):
    #             df2['Elapsed_Time'] = ((Date_of_Close_of_EW - Date_of_Early_Warning))
    #         else:
    #             df2['Elapsed_Time'] = np.timedelta64(0, 'D')
    #         if (df2['Elapsed_Time'] >= np.timedelta64(365, 'D')).bool():
    #             df2['Elapsed_Time_more_then_1_year'] = True
    #         else:
    #             df2['Elapsed_Time_more_then_1_year'] = False
    #         df2['Elapsed_Time'] = df2['Elapsed_Time'] / 1000 / 1000 / 86400000
    #         df = df.append(df2)

    #     df['Date of Close of EW']=df['Date of Close of EW'].apply(pd.to_datetime)
    #     df['Date of Close of EW'] = df['Date of Close of EW'] - pd.Timedelta(hours=8)
    #     df['Date of Early Warning']=df['Date of Early Warning'].apply(pd.to_datetime)
    #     df['Date of Early Warning'] = df['Date of Early Warning'] - pd.Timedelta(hours=8)
    #     # df['Action Party (CEDD / AECOM / CW-KL JV)']=np.array(df['Action Party (CEDD / AECOM / CRCC-PY JV)'].tolist())
    #     df.columns = df.columns.str.replace(' ', '_').str.replace('.', '_').str.replace('(', '_').str.replace(')', '').str.replace('/', '_').str.replace('%', 'percent')
    #     # df['Action_Party___CEDD_/_AECOM_/_DCK_JV']=np.array(df['Action_Party___CEDD_/_AECOM_/_DCK_JV'].tolist())
    #     df.to_sql('c5_nec_risk_register', con=conn, if_exists='replace', index= False)


# DAG "c5_nec": obtain a dRoW API token, then run the data pipeline.
#
# The cron expression below fires at minute 0 of hours 0, 4, 8, 11 and 16
# every day.
# NOTE(review): the hour list is irregular (11 rather than 12) — confirm that
# this is intended.
with DAG(
        dag_id="c5_nec",
        schedule_interval="0 0,4,8,11,16 * * *",
        default_args={
            "owner": "airflow",
            # Each task is retried once, five minutes after a failure.
            "retries": 1,
            "retry_delay": timedelta(minutes=5),
            "start_date": datetime(2022, 10, 24)
        },
        # Do not backfill the schedule slots between start_date and today.
        catchup=False) as f:

    # The operator variables deliberately use names different from the
    # callables they wrap, so the module-level functions getDrowToken and
    # pipelineProcess are not rebound to operator objects.
    # The task_id values are unchanged, so existing task history still matches.

    # Authenticates against dRoW and pushes the token to XCom (key "token").
    # provide_context=True passes the Airflow context as keyword arguments,
    # which getDrowToken(**context) needs in order to reach context["ti"].
    get_drow_token_task = PythonOperator(
        task_id="getDrowToken",
        python_callable=getDrowToken,
        provide_context=True,
    )

    # Runs the extract/transform/load step.
    # NOTE(review): presumably it pulls the token pushed by the task above
    # from XCom — confirm against pipelineProcess.
    pipeline_process_task = PythonOperator(
        task_id="pipelineProcess",
        python_callable=pipelineProcess,
        provide_context=True,
    )

    # The token must exist before the pipeline starts.
    get_drow_token_task >> pipeline_process_task
DAG: c5_nec

schedule: 0 0,4,8,11,16 * * *

c5_nec