#!/usr/bin/env python3
# -*- coding: utf-8 -*-
#
# gplogfilter [options] [inputfile]...
#
# Copyright (c) Greenplum Inc 2008. All Rights Reserved.
#

# import Python standard library modules
import csv
import gzip
import locale
import os
import os.path
import re
import sys

from optparse import Option, OptionGroup, OptionParser, OptionValueError, SUPPRESS_USAGE

# import GPDB modules
try:
    from gppylib.gpparseopts import *
    from gppylib.datetimeutils import str_to_datetime, str_to_duration, DatetimeValueError
    from gppylib.logfilter import *
    from gppylib.commands.gp import get_coordinatordatadir
except ImportError as e:
    sys.exit('ERROR: Cannot import modules.  Please check that you have sourced greenplum_path.sh.  Detail: ' + str(e))

# Localized spellings of the ERROR / FATAL / PANIC severity labels, grouped
# by the language code noted above each group.  The mainline joins them into
# the pattern used by the --trouble option.
# These values are from cdb-pg/src/backend/po/*.po
TROUBLE_VALUES = [
    # EN
    'ERROR', 'FATAL', 'PANIC',
    # AF
    'FOUT', 'FATAAL', 'WANHOOP',
    # DE
    'FEHLER', 'FATAL', 'PANIC',
    # FR
    'ERREUR', 'FATAL', 'PANIC',
    # KO
    '오류', '치명적오류', '손상',
    # NB ("ERROR" and "PANIC" only)
    'FEIL', 'PANIKK',
    # PT_BR
    'ERRO', 'FATAL', 'P\u00c2NICO',
    # SK
    'CHYBA', 'FAT\u00c1LNE', 'PANIKA',
    # SL
    'NAPAKA', 'USODNA NAPAKA', 'PANIKA',
    # SV
    'FEL', 'FATALT', 'PANIK',
    # TR
    'HATA', 'ÖLÜMCÜL', 'KRİTİK',
    # ZH_CN (kept as gb2312 bytes, decoded at import time)
    b'\xB4\xED\xCE\xF3'.decode('gb2312'),
    b'\xD6\xC2\xC3\xFC\xB4\xED\xCE\xF3'.decode('gb2312'),
    b'\xB1\xC8\xD6\xC2\xC3\xFC\xB4\xED\xCE\xF3\xBB\xB9\xB9\xFD\xB7\xD6\xB5\xC4\xB4\xED\xCE\xF3'.decode('gb2312'),
    # ZH_TW ("ERROR" and "FATAL" only)
    '錯誤', '嚴重錯誤',
]

# Short summary of the tool.  parseargs() collapses its whitespace and hands
# the result to the option parser as the program description.
description = ("""
Reads GPDB log file(s), extracts log entries which meet
all the criteria you specify, and writes them to output
file(s) or standard output.
""")

# Paragraphs of extended help text; parseargs() stores this list on the
# parser as `helpStr`.
# NOTE(review): parts of this text look out of step with the code in this
# file and should be confirmed before the wording is relied on:
#   * with no input file, the mainline reads every file found under
#     "<coordinator data dir>/log" rather than appending ".log";
#   * when --out is a directory, openOutputFile() names the output file
#     "<input name>.out", not the bare input file name.
_help = ["""
If no input file is specified, the value of the COORDINATOR_DATA_DIRECTORY
environment variable is used, if defined.  If an input file is a
directory, the suffix ".log" is appended to obtain the input file name.
To read from standard input, specify input file "-".
""", """
To write the extracted log entries to a destination other than stdout,
use the --out option.  If the destination is a directory, then for each
input file, a correspondingly named output file is written there;
otherwise the results from all input files are written to the specified
--out file or standard output.
""", """
In the input file, each log entry starts with a timestamp
"yyyy-mm-dd hh:mm:ss[.fraction]" at the beginning of a line.
Any following lines are considered to belong to the same entry,
up to the next line having a different timestamp.
""", """
--begin and --end timestamp values can be specified as either
"yyyy-mm-dd[ hh:mm[:ss]]" or "yyyymmdd[ hhmm[ss]]".  Between date
and time, either a space or the letter "T" is required.  When only
the date is given, 00:00:00 is used for the time.
Specify --duration as "[hours][:minutes[:seconds]]";
it is unused if both --begin and --end are given.
""", """
The ending date and time can be given by --end, or computed as
--begin plus --duration.  The beginning date and time can be
given by --begin, or computed as --end minus --duration, or
computed as the current date and time minus --duration.
""", """
Log entries are skipped unless they fulfill all of the conditions
you specify.  For example, if you specify two occurrences of the
--find option, log entries must contain both of the strings.
""", """
If the destination specified by --out is a directory, the output file is
given the same name as the input file (excluding '.gz' suffix if any).
When writing compressed output, '.gz' is suffixed to the output file name.
If you specify an output file name ending in '.gz', the output is
compressed (-z9) by default.
""", """
Example:
gplogfilter -t -d2
# view trouble messages timestamped within the past two hours
"""]


def parseargs():
    """Build the gplogfilter option parser and parse the command line.

    The parser is an OptParser using OptChecker as its option class; the
    non-standard option types ('datetime', 'duration', 'literal', 'regex')
    and actions ('MatchRegex', 'NoMatchRegex', 'MatchColumns',
    'optionalSecondArg', 'briefhelp') used below are presumably provided by
    those gppylib classes -- they are not defined in this file.

    Returns:
        (options, args) exactly as returned by parser.parse_args().
        Defaults set here: verbose=True, filters=[], slice=(None, None).
    """
    # Create our OptionParser object
    parser = OptParser(option_class=OptChecker,
                       description=' '.join(description.split()),
                       version='%prog version $Revision$')
    parser.helpStr = _help
    parser.set_usage('%prog [--help] [options] [inputfile]...')
    # The stock -h is dropped so that -h/-?/--help can be re-registered in
    # the 'Message options' group further down.
    parser.remove_option('-h')

    # ---- Timestamp range ------------------------------------------------
    optgrp = OptionGroup(parser, '\n  Timestamp range',
                         'Use any two of these options to impose lower and '
                         'upper bounds on timestamps; or use any one option to '
                         'bound the timestamp range on just one side. ')
    # Help text fixed: the optional time part was missing its closing ']'.
    optgrp.add_option('-b', '--begin', type='datetime', metavar='datetime',
                      help='beginning date and time: "yyyy-mm-dd[ hh:mm[:ss]]"')
    optgrp.add_option('-e', '--end', type='datetime', metavar='datetime',
                      help='ending date and time')
    optgrp.add_option('-d', '--duration', type='duration', metavar='[h][:m[:s]]',
                      help='duration from beginning to end')
    optgrp.add_option('--prunefiles', action='store_true', default=False,
                      help="Discard files based on filename of the form gpdb-%Y-%m-%d_%H%M%S.csv")
    parser.add_option_group(optgrp)

    # ---- Pattern and string matching --------------------------------------
    # All of these append to the shared 'filters' destination, so they can be
    # repeated and combined on one command line.
    optgrp = OptionGroup(parser, 'Pattern and string matching',
                         'Log entries can be chosen depending on whether '
                         'they contain a match for a pattern (regular '
                         'expression) or string. Matching of alphabetic '
                         'characters is case-sensitive unless preceded '
                         'by --case=ignore. These options can be used as many '
                         'times as needed to apply multiple restrictions. '
                         'Regular expression syntax is documented at '
                         'http://docs.python.org/lib/re-syntax.html')
    optgrp.add_option('-f', '--find', type='literal', metavar='string',
                      dest='filters', action='MatchRegex',
                      help='select log entries containing string')
    optgrp.add_option('-F', '--nofind', type='literal', metavar='string',
                      dest='filters', action='NoMatchRegex',
                      help='reject log entries containing string')
    optgrp.add_option('-m', '--match', type='regex', metavar='regex',
                      dest='filters', action='MatchRegex',
                      help='select log entries where a match for the regex is found')
    optgrp.add_option('-M', '--nomatch', type='regex', metavar='regex',
                      dest='filters', action='NoMatchRegex',
                      help='reject log entries where a match for the regex is found')
    optgrp.add_option('-t', '--trouble', action='store_true',
                      help='select log entries having ERROR:, FATAL:, or PANIC: '
                           'in the first line')
    optgrp.add_option('-C', '--columns', type='str', metavar='string',
                      dest='filters', action='MatchColumns',
                      help='SELECT specific log file columns, provided AS a '
                           'comma delimiter string counting FROM 1')
    parser.add_option_group(optgrp)

    # ---- Mode flags -------------------------------------------------------
    optgrp = OptionGroup(parser, 'Mode flags',
                         'These flags affect the behavior of other options '
                         'specified to their right.  They can be used '
                         'as many times as needed.')
    optgrp.add_option('-c', '--case', type='choice', choices=['i', 'ignore', 'r', 'respect'],
                      metavar='i[gnore]|r[espect]',
                      action='callback',
                      callback=OptChecker.regexSetCaseSensitivity,
                      help=('ignore or respect the distinction between '
                            'upper and lower case letters in pattern and '
                            'string matching options after this'))
    parser.add_option_group(optgrp)

    # ---- Final selection --------------------------------------------------
    optgrp = OptionGroup(parser, 'Final selection',
                         'Limit the output to a subsequence of the '
                         'qualifying log entries from each input file. '
                         'Use at most one of these options.')
    optgrp.add_option('-n', '--tail', type='int', metavar='N',
                      help=('select the last N qualifying log entries'))
    optgrp.add_option('-s', '--slice', type='int', metavar='I [J]',
                      action='optionalSecondArg',
                      help=('select qualifying log entries I <= i < J '
                            '(0 is first; <0 is relative to the end)'))
    parser.add_option_group(optgrp)

    # ---- Input options ----------------------------------------------------
    optgrp = OptionGroup(parser, 'Input options')
    optgrp.add_option('-u', '--unzip', action='store_true',
                      help='read gzip-compressed input; assumed when inputfile suffix is ".gz"')
    parser.add_option_group(optgrp)

    # ---- Output options ---------------------------------------------------
    # --zip is kept as a one-character string; callers compare it with '0'.
    optgrp = OptionGroup(parser, 'Output options')
    optgrp.add_option('-o', '--out', type='string', metavar='outputfile',
                      help='write output to specified file or directory (instead of stdout)')
    optgrp.add_option('-z', '--zip', type='choice', choices=list('0123456789'), metavar='0..9',
                      help=('compression level (gzip): 0 = no compression; '
                            '9 = maximum compression'))
    optgrp.add_option('-a', '--append', action='store_true',
                      help="when output file already exists, append to it; don't overwrite")
    parser.add_option_group(optgrp)

    # ---- Message options --------------------------------------------------
    optgrp = OptionGroup(parser, 'Message options')
    optgrp.add_option('-q', '--quiet', dest='verbose', action='store_false',
                      help='suppress status messages')
    optgrp.add_option('-h', '-?', '--help', action='help',
                      help='show this help message and exit')
    optgrp.add_option('--usage', action="briefhelp")
    parser.add_option_group(optgrp)

    parser.set_defaults(verbose=True, filters=[], slice=(None, None))

    # Parse the command line arguments
    (options, args) = parser.parse_args()

    return options, args


# -------------------------------------------------------------------------

def openInputFile(ifn, options):
    """Open one input source and return a text-mode line iterator for it.

    Args:
        ifn: input file name; '-' selects standard input.  If it names a
            directory, '.log' is appended to obtain the file name.
        options: parsed options; only `options.unzip` is read here.

    Returns:
        (fileIn, filesToClose, ifn, zname) where
          fileIn       -- the stream to read entries from.  It always yields
                          str, also for gzip input; a name ending in '.csv'
                          is additionally routed through csv.reader and
                          CsvFlatten.
          filesToClose -- objects the caller must close, outermost first.
          ifn          -- absolute input path, or 'stdin'.
          zname        -- base name of the input with any '.gz' suffix
                          removed ('.log' is added if that leaves no
                          extension), or 'stdin'.

    Raises:
        Whatever open() raises if the file cannot be opened.
    """
    # Local import: only this function needs the text wrapper.
    import io

    filesToClose = []
    unzip = options.unzip

    # Open input file, unless reading from stdin
    if ifn == '-':
        ifn = zname = 'stdin'
        # gzip needs the raw byte stream, not the decoded text stream.
        fileIn = getattr(sys.stdin, 'buffer', sys.stdin) if unzip else sys.stdin
    else:
        ifn = os.path.abspath(ifn)
        # In case a coordinator or segment instance's data directory name
        # was given, append '.log' to get the instance's log file name
        # (gpdb convention).
        if os.path.isdir(ifn):
            ifn += '.log'
        zname = os.path.split(ifn)[1]
        if ifn.endswith('.gz'):
            unzip = True
            zname = zname[0:-3]
            if os.path.splitext(zname)[1] == '':
                zname += '.log'
        fileIn = open(ifn, 'rb' if unzip else 'r')
        filesToClose.append(fileIn)

    # Set up input decompression
    if unzip:
        gzIn = gzip.GzipFile(zname, 'rb', fileobj=fileIn)
        filesToClose.insert(0, gzIn)
        # GzipFile yields bytes, but csv.reader requires str and the
        # uncompressed path above already yields str.  Decode here so every
        # consumer sees the same kind of line whatever the compression.
        fileIn = io.TextIOWrapper(gzIn)
        filesToClose.insert(0, fileIn)

    if zname.endswith('.csv'):
        fileIn = csv.reader(fileIn, delimiter=',', quotechar='"')
        fileIn = CsvFlatten(fileIn)

    return fileIn, filesToClose, ifn, zname


def openOutputFile(ifn, zname, options):
    """Open the output destination and return a text-mode stream for it.

    Args:
        ifn: input file name (not used here; kept for the caller's
            signature).
        zname: base name of the input file; used to name the output file
            when --out is a directory, and as the name passed to GzipFile.
        options: parsed options; reads `append`, `zip`, `out`, `verbose`.

    Returns:
        (fileOut, filesToClose) where fileOut accepts str (so the caller
        can print() to it) and filesToClose lists the objects the caller
        must close, outermost first.  Standard output is never put on that
        list.

    Raises:
        IOError: if the output location is a directory after the name has
            been worked out, or the output directory is missing or not
            writable.
    """
    # Local import: only this function needs the text wrapper.
    import io

    filesToClose = []

    # append or overwrite?
    omode = 'a' if options.append else 'w'

    # --zip is a one-character string; anything above '0' means compress.
    zipout = options.zip and options.zip > '0'

    # Open output file, unless writing to stdout
    if options.out is None:
        # gzip must write to the raw byte stream underneath sys.stdout.
        fileOut = getattr(sys.stdout, 'buffer', sys.stdout) if zipout else sys.stdout
    else:
        # let error messages show full path in case something goes wrong
        ofn = os.path.abspath(options.out)

        # if ofn refers to a directory, append name of input file
        if os.path.isdir(ofn):
            ofn = os.path.join(ofn, zname + ".out")

        # append .gz suffix if compressing
        if zipout and not ofn.endswith('.gz'):
            ofn += '.gz'

        # error if ofn is a directory now
        if os.path.isdir(ofn):
            raise IOError('cannot write output file because there is a '
                          'directory at the output location: %s' % ofn)

        # make sure we can write to the output directory
        odn = os.path.split(ofn)[0]
        if not os.access(odn, os.W_OK):
            raise IOError('output directory not found or not writable: %s' % odn)

        # open the file
        if zipout:
            # The gzip layer needs a binary file underneath it.
            fileOut = open(ofn, omode + 'b')
        else:
            # Text mode, because the caller writes str with print();
            # newline='' keeps line-ending characters exactly as written.
            fileOut = open(ofn, omode, newline='')
        filesToClose.append(fileOut)

        if options.verbose:
            if options.append:
                print(' append to', ofn, file=sys.stderr)
            else:
                print(' output to', ofn, file=sys.stderr)

    # Set up output compression
    if zipout:
        gzOut = gzip.GzipFile(zname, omode + 'b',
                              compresslevel=int(options.zip),
                              fileobj=fileOut)
        filesToClose.insert(0, gzOut)
        # Encode str to bytes on the way into the gzip stream.  Closing the
        # wrapper flushes and closes gzOut; closing gzOut again afterwards
        # is harmless.
        fileOut = io.TextIOWrapper(gzOut, newline='')
        filesToClose.insert(0, fileOut)

    return fileOut, filesToClose


# ------------------------------- Mainline --------------------------------

# Imported here, after the gppylib star-imports at the top of the file, so
# that `datetime` below is certain to be the standard-library class no
# matter what those star-imports happen to export.
from datetime import datetime

# Use default locale specified by LANG environment variable
try:
    locale.setlocale(locale.LC_ALL, '')
except Exception:
    # Best effort only: an unusable locale must not stop the tool.
    pass

# Parse the command line arguments
options, args = parseargs()

# Determine timestamp range (whole seconds only)
begin, end = spiffInterval(options.begin, options.end, options.duration)
if begin:
    begin = begin.replace(microsecond=0)
if end:
    end = end.replace(microsecond=0)

# Insert trouble message filter ahead of other pattern matching filters
if options.trouble:
    options.filters.insert(0, filterize(MatchInFirstLine,
                                        (': |'.join(TROUBLE_VALUES)) + u': '))

# Limit output to last N entries if requested.  Let --tail override --slice.
if options.tail is None:
    sliceBegin, sliceEnd = options.slice
elif options.tail > 0:
    sliceBegin, sliceEnd = -options.tail, None
else:
    sliceBegin, sliceEnd = 0, 0

# Output suffix .gz implies maximum compression unless overridden by -z
if (options.zip is None and
        options.out and
        options.out.endswith('.gz')):
    options.zip = '9'

try:
    # If no inputfile arg, try COORDINATOR_DATA_DIRECTORY environment variable
    if not args:
        s = get_coordinatordatadir()
        if s:
            # we only support log rotation in log dir.
            logdir = os.path.join(s, "log")
            if os.path.exists(logdir):
                # os.listdir() order is arbitrary; sort so that the files
                # are processed in a stable, name-based order.
                for logfile in sorted(os.listdir(logdir)):
                    args.append(os.path.join(logdir, logfile))
            else:
                raise IOError('Specify input file or "-" for standard input')
        else:
            raise IOError('specify input file or "-" for standard input')

    # In MS Windows, apply shell wildcard expansion to input filename list
    if sys.platform == 'win32':
        import glob

        newargs = []
        for a in args:
            names = glob.glob(a)
            names.sort()
            newargs.extend(names)
        args = newargs

    # Two distinct lists: input and output handles are tracked separately.
    inputFilesToClose = []
    outputFilesToClose = []
    fileOut = None
    try:
        # Output to a directory?
        outputFilePerInputFile = False
        if options.out:
            options.out = os.path.abspath(options.out)
            if os.path.isdir(options.out):
                outputFilePerInputFile = True

        if options.verbose:
            msg = ('requested timestamp range from %s to %s'
                   % (begin or 'beginning of data', end or 'end of data'))
            print(msg, file=sys.stderr)

        # Loop over input files.
        #
        # A file whose name looks like gpdb-<timestamp>.csv can be skipped
        # as a whole when --prunefiles was given and the timestamp in its
        # name lies after the requested end of the range.  Every other file
        # is read and filtered entry by entry.
        for ifn in args:
            # Open next input file
            fileIn, inputFilesToClose, ifn, zname = openInputFile(ifn, options)

            # if we can skip the whole file, let's do so
            if zname.startswith('gpdb') and zname.endswith('.csv'):
                # Text between the 'gpdb?' prefix and the '.csv' suffix.
                stamp = zname[5:-4]
                filedate = None
                # try format YYYY-MM-DD_HHMMSS, then YYYY-MM-DD
                for fmt in ('%Y-%m-%d_%H%M%S', '%Y-%m-%d'):
                    try:
                        filedate = datetime.strptime(stamp, fmt)
                        break
                    except ValueError:
                        # not this format; filedate stays None if none fit
                        continue

                # Only the upper bound can rule a file out: a file named
                # for a moment before `begin` may still hold later entries.
                # NOTE(review): this assumes the timestamp in the file name
                # is no later than the first entry in the file (the usual
                # log-rotation naming) -- confirm for renamed files.
                if (options.prunefiles and filedate is not None
                        and end and filedate > end):
                    print("SKIP file: %s" % zname, file=sys.stderr)
                    for f in inputFilesToClose:
                        f.close()
                    inputFilesToClose = []
                    continue

            # Announce each input file *before* its output file if --out is dir
            if options.verbose and outputFilePerInputFile:
                print('----------', ifn, '----------', file=sys.stderr)

            # Open the output file (once per input file if --out is a directory)
            if fileOut is None:
                fileOut, outputFilesToClose = openOutputFile(ifn, zname, options)

            # Announce input files *after* single output file
            if options.verbose and not outputFilePerInputFile:
                print('----------', ifn, '----------', file=sys.stderr)

            # Construct the filtering pipeline
            filteredInput = FilterLogEntries(fileIn,
                                             verbose=options.verbose,
                                             beginstamp=begin,
                                             endstamp=end,
                                             filters=options.filters,
                                             ibegin=sliceBegin,
                                             jend=sliceEnd)

            # Write filtered lines to output file.
            # NOTE(review): print() appends a newline to every item.  Confirm
            # whether FilterLogEntries yields lines with or without their
            # original line ending; if they still carry it, the output will
            # contain blank lines and end='' is needed here.
            for line in filteredInput:
                print(line, file=fileOut)

            # Close input and output files
            for f in inputFilesToClose:
                f.close()
            inputFilesToClose = []

            if outputFilePerInputFile:
                fileOut = None
                for f in outputFilesToClose:
                    f.close()
                outputFilesToClose = []

    finally:
        # Whatever is still open (normal end or error) gets closed here.
        for f in outputFilesToClose:
            f.close()
        for f in inputFilesToClose:
            f.close()
except IOError as msg:
    execname = os.path.basename(sys.argv[0])
    print('%s: (IOError) "%s"' % (execname, msg), file=sys.stderr)
    sys.exit(2)
