#!/usr/bin/env python
# +-======-+ 
#  Copyright (c) 2003-2007 United States Government as represented by 
#  the Administrator of the National Aeronautics and Space Administration.
#  All Rights Reserved.
#  
#  THIS OPEN  SOURCE  AGREEMENT  ("AGREEMENT") DEFINES  THE  RIGHTS  OF USE,
#  REPRODUCTION,  DISTRIBUTION,  MODIFICATION AND REDISTRIBUTION OF CERTAIN 
#  COMPUTER SOFTWARE ORIGINALLY RELEASED BY THE UNITED STATES GOVERNMENT AS 
#  REPRESENTED BY THE GOVERNMENT AGENCY LISTED BELOW ("GOVERNMENT AGENCY").  
#  THE UNITED STATES GOVERNMENT, AS REPRESENTED BY GOVERNMENT AGENCY, IS AN 
#  INTENDED  THIRD-PARTY  BENEFICIARY  OF  ALL  SUBSEQUENT DISTRIBUTIONS OR 
#  REDISTRIBUTIONS  OF THE  SUBJECT  SOFTWARE.  ANYONE WHO USES, REPRODUCES, 
#  DISTRIBUTES, MODIFIES  OR REDISTRIBUTES THE SUBJECT SOFTWARE, AS DEFINED 
#  HEREIN, OR ANY PART THEREOF,  IS,  BY THAT ACTION, ACCEPTING IN FULL THE 
#  RESPONSIBILITIES AND OBLIGATIONS CONTAINED IN THIS AGREEMENT.
#  
#  Government Agency: National Aeronautics and Space Administration
#  Government Agency Original Software Designation: GSC-15354-1
#  Government Agency Original Software Title:  GEOS-5 GCM Modeling Software
#  User Registration Requested.  Please Visit http://opensource.gsfc.nasa.gov
#  Government Agency Point of Contact for Original Software:  
#  			Dale Hithon, SRA Assistant, (301) 286-2691
#  
# +-======-+ 
"""Check obsys_rc file against available data."""

import argparse
import glob
import obsys_rc
import re
import time

# sentinel "no data" datetime, used when no files are found for a template
zeros = "00000000_0000z"

#.......................................................................
def check(filename="obsys.rc",
          checkfile="default",
          newfile="default",
          obslist=["all"],
          ignore_gap=[],
          write_comments=True,
          show_miss_data=True,
          show_misfiles=True,
          show_gaps=False,
          show_found_data=True,
          suppress_date=False):
    """
    Check obsys_rc file against available data.

    input parameters
    => filename: name of obsys_rc to check
    => checkfile: name of output check file; defaults to filename+".check"
    => newfile: name of output new file; defaults to filename+".new"
    => obslist: list of observation classes to process
    => ignore_gap: list of obsclass[<threshold] values where data gaps less
                   than threshold (hours) will be ignored; default threshold
                   = 24 (hours)
    => write_comments: (boolean) write obsys_rc table comments to output
    => show_miss_data: (int string "N" or False) show dates of
                       missing data up to N days before "today"
    => show_misfiles: (boolean) show misfile info
    => show_gaps: (boolean) show all data gaps
    => show_found_data: (boolean) show data found in data gaps
    => suppress_date: (boolean) do not display lastday in output;
                      (for testing purposes only)
    """

    _set_global_pattern_dictionary()
    global pattern

    if checkfile == "default":
        checkfile = filename + ".check"

    if checkfile == filename:
        msg = "Cannot write check info back to the same file: {0}"
        raise ValueError(msg.format(checkfile))

    if newfile == "default":
        newfile = filename + ".new"

    if newfile == filename:
        msg = "Cannot write new info back to the same file: {0}"
        raise ValueError(msg.format(newfile))

    # parse ignore_gap entries, e.g. "someclass<6" => threshold["someclass"] = 6
    threshold = {}
    for obsStr in ignore_gap:
        if "<" in obsStr:
            (obsclass, thresh) = obsStr.split('<')
        else:
            obsclass = obsStr
            thresh = 24

        threshold[obsclass] = int(thresh)

    today = time.strftime("%Y%m%d_0000z")
    if show_miss_data:
        try:
            # back up N days (expressed in minutes) from today
            num_minutes = -int(show_miss_data)*24*60
            lastday = incr_datetime(today, num_minutes)
        except (ValueError, TypeError):
            lastday = today

    # load input obsys_rc file
    #-------------------------
    obsys = obsys_rc.Load(filename)

    # open output file
    #-----------------
    checkfl = open(checkfile, mode='w')
    newfl = open(newfile, mode='w')

    # loop thru data from obsys_rc file
    #----------------------------------
    for (obsclass, recvals) in obsys.obsinfo():
        if "all" not in obslist and obsclass not in obslist:
            continue

        # set thresh gap value
        #---------------------
        thresh = threshold.get(obsclass, 0)

        print("\nChecking {0}".format(obsclass))

        # output prolog lines, table start, and comments
        #-----------------------------------------------
        if recvals["prolog"]:
            for line in recvals["prolog"]:
                checkfl.write(line+'\n')
                newfl.write(line+'\n')

        checkfl.write("BEGIN {0} => {1}\n".format(obsclass, recvals["outtmpl"]))
        newfl.write("BEGIN {0} => {1}\n".format(obsclass, recvals["outtmpl"]))
        if write_comments:
            for line in recvals["comments"]:
                checkfl.write(line+"\n")
                newfl.write(line+"\n")

        # sort obsclass rows by template
        #----------------------------------------------
        # NOTE: an obsclass can have multiple templates
        #----------------------------------------------
        template_start_stop_list = []
        for row in recvals["rows"]:
            (start_stop, interval, template) = row.split()
            (start, stop) = pattern["start_stop"].search(start_stop).groups()
            start += "00"
            stop  += "00"

            # dict() shares the per-template lists with template_start_stop_list,
            # so appending here extends the entry already stored in the list
            start_stop_by_tmpl = dict(template_start_stop_list)
            if template in start_stop_by_tmpl:
                start_stop_by_tmpl[template].append((start, stop))
            else:
                template_start_stop_list.append((template, [(start, stop)]))

        # process and output rows by template
        #------------------------------------
        for (template, start_stop_list) in template_start_stop_list:
            print("=> {0}".format(template))

            # write original data availability info to checkfile
            #---------------------------------------------------
            for (start, stop) in start_stop_list:
                start_ = start[0:11]
                stop_ = stop[0:11]
                line = "  %sz-%sz %s %s\n"%(start_, stop_, interval, template)

                if start == zeros:
                    line = '#' + line[1:]
                checkfl.write(line)

            # write actual data availability info to newfile
            #-----------------------------------------------
            (start_stop_data, deltaMin, misfiles) = _get_data_info(template, thresh)

            hh = deltaMin // 60
            nn = deltaMin % 60
            interval_actual = "%02d%02d00"%(hh, nn)

            for (start, stop) in start_stop_data:
                start_ = start[0:11]
                stop_ = stop[0:11]
                line = "  %sz-%sz %s %s\n"%(start_, stop_, interval_actual, template)

                if start == zeros:
                    line = '#' + line[1:]
                newfl.write(line)

            # look for gap information
            #-------------------------
            first = min(start_stop_list[0][0],  start_stop_data[0][0])
            final = max(start_stop_list[-1][1], start_stop_data[-1][1])

            if final > today:
                final = today

            if final < first:
                msg = "first datetime is greater than final: {0} > {1}"
                raise ValueError(msg.format(first, final))

            gaps = []
            miss_data = []
            found_data = []

            datetime = first
            while datetime <= final:
                if _not_included(start_stop_list, datetime):

                    if _not_included(start_stop_data, datetime):
                        gaps.append(datetime)
                    else:
                        found_data.append(datetime)

                elif _not_included(start_stop_data, datetime):
                    miss_data.append(datetime)
                    gaps.append(datetime)

                datetime = incr_datetime(datetime, deltaMin)
                
            border1 = "  #"+"-"*26+"\n"
            border2 = "  #"+"="*26+"\n"

            # write miss data info to output
            #-------------------------------
            if show_miss_data and miss_data:

                msg = "  # MISSING DATA"
                if not suppress_date:
                    msg += "\n  # start  < %s"%(lastday)
                    msg += "\n  # or end < %s"%(today)
                msg += "\n"
                miss_tuples = _start_stop_tuples(miss_data, deltaMin)

                wrote_info = False
                for (start, stop) in miss_tuples:

                    if stop != today or start < lastday:
                        if not wrote_info:
                            checkfl.write(border2+msg+border1)
                            wrote_info = True

                        start_ = start[0:11]
                        stop_ = stop[0:11]

                        if start == stop:
                            checkfl.write("  %sz\n"%(start_))
                        else:
                            checkfl.write("  %sz-%sz\n"%(start_, stop_))

                if wrote_info:
                    checkfl.write(border1+'\n')

            # write all gap info to output
            #-----------------------------
            if show_gaps and gaps:
                msg =  "  # all data gaps\n"
                msg += "  # including MISSING DATA\n"
                checkfl.write(border2+msg+border1)

                gap_tuples = _start_stop_tuples(gaps, deltaMin)
                for (start, stop) in gap_tuples:
                    start_ = start[0:11]
                    stop_ = stop[0:11]
                    if start == stop:
                        checkfl.write("  %sz\n"%(start_))
                    else:
                        checkfl.write("  %sz-%sz\n"%(start_, stop_))
                checkfl.write(border1+'\n')

            # write found data info to output
            #--------------------------------
            if show_found_data and found_data:
                msg = "  # MORE DATA FOUND\n"
                checkfl.write(border2+msg+border1)

                nogap_tuples = _start_stop_tuples(found_data, deltaMin)
                for (start, stop) in nogap_tuples:
                    start_ = start[0:11]
                    stop_ = stop[0:11]
                    if start == stop:
                        checkfl.write("  %sz\n"%(start_))
                    else:
                        checkfl.write("  %sz-%sz\n"%(start_, stop_))
                checkfl.write(border1+'\n')

            # write misfile info to output
            #-----------------------------
            if show_misfiles and misfiles:
                msg  = "  # MISFILES\n"
                checkfl.write(border2+msg+border1)
                misfiles.sort()
                for mis in misfiles:
                    checkfl.write("  #"+mis+"\n")
                checkfl.write(border1+'\n')

        # output table end
        #-----------------
        checkfl.write("END\n")
        checkfl.flush()

        newfl.write("END\n")
        newfl.flush()

    checkfl.close()
    newfl.close()

#.......................................................................
def _csplit(strval, char=','):
    """Split strval on char; used as the argparse type for comma-separated lists."""
    return strval.split(char)

#.......................................................................
def _get_data_info(template, thresh):
    """
    Return list of (start, stop) datetime tuples, plus delta value (in minutes)
    and list of misplaced and misnamed data.

    input parameters
    => template: data path/name template
    => thresh: data gap threshold (hours); ignore data gaps < thresh
    """
    global zeros

    # regular expression patterns to find date/time
    #----------------------------------------------
    global pattern

    # get list of data filepaths
    #---------------------------
    index = template.find(':')+1
    tmpl = template[index:]
    tmpl = tmpl.replace("%y4", "????").replace("%y2", "??").replace("%m2", "??")
    tmpl = tmpl.replace("%d2", "??").replace("%h2", "??").replace("%n2", "??")
    tmpl = tmpl.replace("%j3", "???")

    filepath_list = glob.glob(tmpl)

    # extract datetimes from available data
    #--------------------------------------
    times_found = {}
    datetime_list = []
    misfiles = []

    if filepath_list:
        for fpath in filepath_list:
            min = "00"

            if pattern["yyyymmdd_hhz"].search(fpath):
                returnVals = pattern["yyyymmdd_hhz"].search(fpath).groups()
                (year, month, day, hour) = returnVals

                if pattern["Y4_M2"].search(fpath):
                    (yyyy, mm) = pattern["Y4_M2"].search(fpath).groups()

                    if year != yyyy or month != mm:
                        misfiles.append("(MISPLACED) "+fpath)
                        continue

            elif pattern["yyyymmdd_hh"].search(fpath):
                returnVals = pattern["yyyymmdd_hh"].search(fpath).groups()
                (year, month, day, hour) = returnVals
                if pattern["Y4_M2"].search(fpath):
                    (yyyy, mm) = pattern["Y4_M2"].search(fpath).groups()

                    if year != yyyy or month != mm:
                        misfiles.append("(MISPLACED) "+fpath)
                        continue
 
            elif pattern["Y4_M2"].search(fpath) and pattern["yyyymmdd__hhz"].search(fpath):
                (year, month) = pattern["Y4_M2"].search(fpath).groups()
                (yyyy, mm, day, hour) = pattern["yyyymmdd__hhz"].search(fpath).groups()

                if year != yyyy or month != mm:
                    misfiles.append("(MISPLACED) "+fpath)
                    continue

            elif pattern["Y4_M2"].search(fpath) and pattern["yymmdd__hhz"].search(fpath):
                (year, month) = pattern["Y4_M2"].search(fpath).groups()
                (yy, mm, day, hour) = pattern["yymmdd__hhz"].search(fpath).groups()

                if year[2:4] != yy or month != mm:
                    misfiles.append("(MISPLACED) "+fpath)
                    continue

            elif pattern["Y4_M2"].search(fpath) and pattern["yyyymmdd"].search(fpath):
                (year, month) = pattern["Y4_M2"].search(fpath).groups()
                (yyyy, mm, day) = pattern["yyyymmdd"].search(fpath).groups()
                hour = "00"

                if year != yyyy or month != mm:
                    misfiles.append("(MISPLACED) "+fpath)
                    continue

            elif pattern["yyyy_jjj"].search(fpath) and pattern["yyyyjjj_hhnn"].search(fpath):
                (year, JJJ) = pattern["yyyy_jjj"].search(fpath).groups()
                (yyyy, jjj, hour, min) = pattern["yyyyjjj_hhnn"].search(fpath).groups()

                if year != yyyy or JJJ != jjj:
                    misfiles.append("(MISPLACED) "+fpath)
                    continue

                (month, day) = jjj2mmdd(yyyy, jjj)

            elif pattern["Y4_M2_D2"].search(fpath) and pattern["hhz"].search(fpath):
                (year, month, day) = pattern["Y4_M2_D2"].search(fpath,1).groups()
                (hour,) = pattern["hhz"].search(fpath).groups()

            elif pattern["Y4_M2"].search(fpath):
                (year, month) = pattern["Y4_M2"].search(fpath).groups()

                ddhh_string = year+month+r"(\d{2})(\d{2})"
                ddhh_pattern = re.compile(ddhh_string)
                (day, hour) = ddhh_pattern.search(fpath).groups()

            else:
                msg = "Cannot extract date/time from fpath: {0}"
                raise ValueError(msg.format(fpath))

            numdays = num_days_in_month(int(year), int(month))
            if int(month) < 1 or int(month) > 12 or \
                    int(day) < 1 or int(day) > numdays:
                misfiles.append("(MISLABELED) "+fpath)
                continue
                    
            datetime = year+month+day+'_'+hour+min
            datetime_list.append(datetime)
            times_found[hour+min] = 1

    # determine deltaMin
    #-------------------
    if times_found:
        num_times = len(times_found)
        deltaMin = (24.*60.)/num_times

        if deltaMin != int(deltaMin):
            msg = "Non-divisible number of hour+min times found: {0} for {1}"
            raise ValueError(msg.format(num_times, template))
    else:
        datetime_list = [zeros]
        deltaMin = 24*60

    deltaMin = max(int(deltaMin), thresh*60)

    start_stop_data = _start_stop_tuples(datetime_list, deltaMin)
    return (start_stop_data, deltaMin, misfiles)

#.......................................................................
def _start_stop_tuples(datetime_list, deltaMin):
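    """
    Sort datetime_list in place and collapse it into (start, stop) range tuples,
    merging consecutive datetimes that are no more than deltaMin minutes apart.

    Illustrative example: ["20240101_0000", "20240101_0600", "20240103_0000"]
    with deltaMin=360 yields [("20240101_0000", "20240101_0600"),
    ("20240103_0000", "20240103_0000")].
    """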
    datetime_list.sort()

    start = datetime_list[0]
    previous = start
    next = incr_datetime(start, deltaMin)

    tuple_list = []
    for datetime in datetime_list[1:]:
        if datetime > next:
            tuple_list.append((start, previous))
            start = datetime

        previous = datetime
        next = incr_datetime(datetime, deltaMin)

    tuple_list.append((start, previous))

    return tuple_list

#.......................................................................
def _not_included(date_ranges, datetime):
    """Return False if datetime is not in any of the included date_ranges"""
    for (start, stop) in date_ranges:
        if datetime >= start and datetime <= stop:
            return False
    return True

#.......................................................................
def _set_global_pattern_dictionary():
    """Create patterns to be used in regexp searches to find date/time."""
    global pattern
    pattern = {}

    # regular expression strings
    #---------------------------
    start_stop_string    = r"(\d{8}_\d{2})z-(\d{8}_\d{2})z"
    Y4_M2_string         = r"/Y(\d{4})/M(\d{2})/"
    Y4_M2_D2_string      = r"/Y(\d{4})/M(\d{2})/D(\d{2})/"
    hhz_string           = r"\D(\d{2})z"
    yymmdd__hhz_string   = r"\D(\d{2})(\d{2})(\d{2})\D+(\d{2})z"
    yyyymmdd__hhz_string = r"\D(\d{4})(\d{2})(\d{2})\D+(\d{2})z"
    yyyymmdd_hh_string   = r"(\d{4})(\d{2})(\d{2})\D(\d{2})"
    yyyymmdd_hhz_string  = r"(\d{4})(\d{2})(\d{2})\D(\d{2})z"
    yyyymmdd_string      = r"(\d{4})(\d{2})(\d{2})\D"
    yyyyjjj_hhnn_string  = r"(\d{4})(\d{3})\.(\d{2})(\d{2})\D"
    yyyy_jjj_string      = r"/(\d{4})/(\d{3})/"

    # store regular expression patterns in global variable
    #-----------------------------------------------------
    pattern["start_stop"]    = re.compile(start_stop_string)
    pattern["Y4_M2"]         = re.compile(Y4_M2_string)
    pattern["Y4_M2_D2"]      = re.compile(Y4_M2_D2_string)
    pattern["hhz"]           = re.compile(hhz_string)
    pattern["yymmdd__hhz"]   = re.compile(yymmdd__hhz_string)
    pattern["yyyymmdd__hhz"] = re.compile(yyyymmdd__hhz_string)
    pattern["yyyymmdd_hh"]   = re.compile(yyyymmdd_hh_string)
    pattern["yyyymmdd_hhz"]  = re.compile(yyyymmdd_hhz_string)
    pattern["yyyymmdd"]      = re.compile(yyyymmdd_string)
    pattern["yyyyjjj_hhnn"]  = re.compile(yyyyjjj_hhnn_string)
    pattern["yyyy_jjj"]      = re.compile(yyyy_jjj_string)

#.......................................................................
def incr_datetime(datetime, deltaMin):
    """
    Increment datetime by delta minutes and return the value.

    input parameters
    => datetime: date/time in yyyymmdd_hhmm format
    => deltaMin: integer number of minutes to add to datetime (may be negative)

    return value
    => new_datetime: format yyyymmdd_hhmm
    """
    year  = int(datetime[0:4])
    month = int(datetime[4:6])
    day   = int(datetime[6:8])
    try:
        hour = int(datetime[9:11])
        min  = int(datetime[11:13])
    except ValueError:
        msg = "EXCEPTION: datetime = {0}"
        raise ValueError(msg.format(datetime))

    min += deltaMin
    while min > 59:
        min -= 60
        hour += 1

    while min < 0:
        min += 60
        hour -= 1

    while hour > 23:
        hour -= 24
        day += 1

        if day > num_days_in_month(year, month):
            day = 1
            month += 1

            if month > 12:
                month = 1
                year += 1

    while hour < 0:
        hour += 24
        day -= 1

        if day < 1:
            month -= 1

            if month < 1:
                month = 12
                year -=1

            day = num_days_in_month(year, month)

    return "%04d%02d%02d_%02d%02d"%(year, month, day, hour, min)

#.......................................................................
def num_days_in_month(year, month):
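    """Return the number of days in the given month, accounting for leap years."""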
    numdays = [0, 31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31]

    if month==2 and year%4==0 and (year%100!=0 or year%400==0):
        numdays[2] = 29

    return numdays[month]

#.......................................................................
def jjj2mmdd(year, jjj):
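    """
    Convert a day-of-year string to (month, day) strings.

    e.g. jjj2mmdd("2024", "060") returns ("02", "29"), since 2024 is a leap year.
    """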
    dd = int(jjj)
    for mm in range(1, 13):
        last = num_days_in_month(int(year), mm)

        if dd <= last:
            break

        if mm == 12:
            msg = "Day-of-year value is too large: year = {0}, jjj = {1}"
            raise ValueError(msg.format(year, jjj))

        dd -= last

    month = "%02d"%mm
    day = "%02d"%dd

    return (month, day)

#.......................................................................
if __name__ == "__main__":
    """Check obsys_rc file against available data."""

    # get calling parameters
    #-----------------------
    ArgumentDefaults = argparse.ArgumentDefaultsHelpFormatter
    parser = argparse.ArgumentParser(description=__doc__,
                                     formatter_class=ArgumentDefaults)
    #=========
    # filename
    #=========
    parser.add_argument("filename",
                        nargs='?',
                        type=str,
                        default="obsys.rc",
                        help="name of obsys_rc file")
    #==========
    # checkfile
    #==========
    parser.add_argument("checkfile",
                        nargs='?',
                        type=str,
                        default='filename+".check"',
                        help='name of output file')
    #========
    # newfile
    #========
    parser.add_argument("newfile",
                        nargs='?',
                        type=str,
                        default='filename+".new"',
                        help='name of output file')
    #==========
    # --obslist
    #==========
    parser.add_argument("--obslist",
                        metavar="obsclass_list",
                        type=_csplit,
                        default="all",
                        help="""list of observation classes to process,
                                separated by commas, no spaces""")
    #=============
    # --ignore_gap
    #=============
    parser.add_argument("--ignore_gap",
                        metavar="obsclass_hourgap_threshold_list",
                        type=_csplit,
                        default=[],
                        help="""list of obsclass[<threshold] values where data gaps
                                less than threshold (hours) will be ignored; default
                                threshold = 24 (hours); multiple values separated
                                by commas, no spaces""")
    #===========
    # --comments
    #===========
    help_msg = "write obsys_rc table comments to output; 0 to turn off"
    parser.add_argument("--comments",
                        nargs='?',
                        metavar="0",
                        action="store",
                        type = int,
                        const=True,       # for --comments flag without value
                        default=True,     # for when --comments flag is missing
                        help = help_msg)
    #=======
    # --miss
    #=======
    help_msg = """show dates of missing data up to N days before "today";
                  "False" to turn off"""
    days_before_today = 7
    parser.add_argument("--miss",
                        nargs='?',
                        metavar="N or False",
                        const=days_before_today,
                        default=days_before_today,
                        help = help_msg)
    #===========
    # --misfiles
    #===========
    parser.add_argument("--misfiles",
                        nargs='?',
                        metavar="0",
                        type=int,
                        const=True,
                        default=True,
                        help = "show misfile information; 0 to turn off")
    #=======
    # --gaps
    #=======
    parser.add_argument("--gaps",
                        action="store_true",
                        default=False,
                        help = "show data gaps")
    #========
    # --found
    #========
    parser.add_argument("--found",
                        nargs='?',
                        metavar="0",
                        type=int,
                        const=True,
                        default=True,
                        help = "show data found in the gaps; 0 to turn off")
    #================
    # --suppress_date
    #================
    help_msg = "Do not show dates in MISSING DATA title (testing only)"
    parser.add_argument("--suppress_date",
                        action="store_true",
                        default=False,
                        help=help_msg)

    # extract calling parameters
    #---------------------------
    args = parser.parse_args()

    filename        = args.filename
    obslist         = args.obslist
    ignore_gap      = args.ignore_gap
    show_found_data = args.found
    show_misfiles   = args.misfiles
    write_comments  = args.comments
    suppress_date   = args.suppress_date

    if args.checkfile == 'filename+".check"': checkfile = filename+".check"
    else:                                     checkfile = args.checkfile

    if args.newfile == 'filename+".new"': newfile = filename+".new"
    else:                                 newfile = args.newfile

    if args.gaps: show_gaps = True
    else:         show_gaps = False

    show_miss_data = args.miss
    if show_miss_data == "False":
        show_miss_data = False
    else:
        try:
            int(show_miss_data)
        except (ValueError, TypeError):
            show_miss_data = days_before_today

    # call check function
    #--------------------
    check(filename, checkfile, newfile, obslist, ignore_gap, write_comments,
          show_miss_data, show_misfiles, show_gaps, show_found_data,
          suppress_date)
