#!/usr/bin/python
# -*- coding: utf-8 -*-
# above encoding is for non-ascii in source comments

"""
History:
abb 5/18/2012: crashing bc I didn't check for unicode in ExcessData, fixed, version 1.1.
abb 5/19/2012: epid format was %20.20s, should be %21.21s, fixed, version 1.1.
abb 5/24/2016: IL parsing errors bug fix, version 1.2
abb 12/7/2016: TN bug from added fields in afcarslinker1p2_data.py, version 1.2
abb 6/20/2017: IL bug, allow for lines short by 1 character
abb 11/7/2023: add remdate & plcdate to sort

"""

# abb 6/28/2012: separated the data & file handling stuff to this module.
import os, sys

#===========================================================
# Data & file handling objects:

def updatefunc( prefix, num, unit, suffix, den=0 ):
    """Report progress as '<prefix>Processed <num>[ of <den>] <unit>s'.

    The message is shown in the status bar of app.TopWindow when a
    module-level `app` with a truthy TopWindow exists; otherwise it is
    printed to stderr.  A falsy `den` leaves out the ' of <den>' part.
    `suffix` is accepted but not used.
    """
    total = (' of '+str(den)) if den else ''
    msg = prefix+'Processed '+str(num) + total + ' '+unit+'s'

    # No GUI available: fall back to the console and stop here.
    if not ('app' in globals() and app.TopWindow):
        print >> sys.stderr, msg
        return

    app.TopWindow.SetStatusText( msg )
    app.TopWindow.Refresh()
    # presumably lets the GUI repaint during long loops -- wx is not imported in this module
    wx.SafeYield()

class RecordClass:
    # Row classification codes.  Each code is also the index of its entry in
    # recordlabels / recordcolors, and of its counter in a FileStats row.
    MostRecentEpisode = 0
    ReenteringDischarge = 1
    Duplicate = 2
    MissingDischarge = 3
    # Display label for each code, in code order.
    recordlabels = ( 'Most Recent Episode', 'Reentering Discharge', 'Duplicate Record', 'Missing Discharge Date' )
    # One '#rrggbb' string per code, in code order.
    recordcolors = ( '#80ffb0', '#ff8000', '#a0a0a0', '#ff5050' )

class FileStats(dict):
    """Per-file record counts: maps a file name to a list of counters.

    Each list holds one counter per RecordClass code, followed by the two
    extra counters located by `iftotal` and `ilinks2prior`.
    """
    # Column headings, in the same order as the counters.
    colnames = list(RecordClass.recordlabels) + ['Total','Links To Prior Submissions']
    iftotal = len(RecordClass.recordlabels)  # index of the 'Total' counter
    ilinks2prior = iftotal+1  # index of the 'Links To Prior Submissions' counter

    def ToPct( self ):
        """Return a plain dict with every counter as a rounded integer percent of its file's total.

        The division is done in floating point so the result is rounded
        rather than truncated (integer division under Python 2 made the
        round() a no-op).  A file whose total is 0 gets all-zero
        percentages instead of raising ZeroDivisionError.
        """
        fs2 = dict()
        for fn in self:
            counts = self[fn]
            total = counts[FileStats.iftotal]
            if total:
                fs2[fn] = [int(round(100.0*c / total)) for c in counts]
            else:
                fs2[fn] = [0]*len(counts)
        return( fs2 )

# Finds this executable's path, even when frozen in py2exe/cx_freeze.
class ScriptPath():
    """Locate the directory of the running program.

    After construction, `d` holds the directory part of sys.executable when
    the interpreter reports itself as frozen (py2exe/cx_freeze set
    sys.frozen), otherwise the directory part of sys.argv[0].
    """
    d = ''

    def __init__(self):
        # http://www.py2exe.org/index.cgi/WhereAmI
        import sys
        ep = sys.executable if hasattr(sys, "frozen") else sys.argv[0]
        # Only byte strings need decoding; a path that is already text is used
        # as is.  sys.getfilesystemencoding() may return None, so fall back to
        # the interpreter's default encoding.
        if isinstance( ep, bytes ):
            ep = ep.decode( sys.getfilesystemencoding() or sys.getdefaultencoding() )
        self.d = os.path.dirname( ep )

# Provide fixed-width file formats from config file.  Provide method to guess file format.
class FWFFormat():
    fwf = None
    errmsg = None

    def __init__( self, filename ):
        """Load fixed-width-file format specifications from config file `filename`.

        On success self.fwf maps each config section name to a dict of its
        options; option values containing '|' after their first character are
        split into lists.  Sections that define 'fieldwidths' also get:
          'fieldwidths' converted to a list of ints,
          'fmtstring'   the matching struct format ('2s6s...'),
          'reclen'      the record length in bytes,
          'Parser'      struct.Struct(fmtstring).unpack_from.
        On failure self.fwf stays None and self.errmsg holds a warning text.
        """
        import os
        import struct
        import sys
        if not os.path.isfile( filename ):
            self.errmsg = 'WARNING: fixed-width-file configuration file: '+filename+' not found.'
            return( None )

        # Read the fixed-width-format file specifications from a config file.
        try:
            import ConfigParser as configparser  # Python 2 module name
            parsercls = configparser.SafeConfigParser
        except ImportError:
            import configparser  # Python 3: ConfigParser is the former SafeConfigParser
            parsercls = configparser.ConfigParser
        config = parsercls()
        try:
            config.read( filename )
        except Exception:
            # Any parse/read problem is reported through errmsg, not raised.
            self.errmsg = 'WARNING: failed to open fixed-width-file configuration file: '+filename+'. Exception = '+str(sys.exc_info()[1])
            return( None )
        sys.stderr.write( 'Fixed-width-file configuration read from %s .\n' % (filename,) )
        self.errmsg = None

        fwf = dict()  # the fixed-width-format dict
        for sec in config.sections():
            fwf[sec] = dict()
            for opt in config.options(sec):
                val = config.get( sec, opt )
                # a '|' past the first character turns the value into a list
                if val and val.find('|') > 0:
                    val = val.split('|')
                fwf[sec][opt] = val

        self.fwf = fwf

        # Calculated values for fwf specs:
        for ft in fwf:
            spec = fwf[ft]
            # Convert the fwf fieldwidths into format strings for struct.unpack_from.
            if 'fieldwidths' in spec:
                widths = spec['fieldwidths']
                if not isinstance( widths, list ):
                    # A single width has no '|' and is still a string; iterating
                    # it would yield one "width" per digit.
                    widths = [widths] if widths else []
                widths = [int(w) for w in widths]
                spec['fieldwidths'] = widths
                # then fill in the format string for the struct fixed-width parser
                spec['fmtstring'] = '%ds'*len(widths) % tuple(widths)
                spec['reclen'] = struct.calcsize( spec['fmtstring'] )
                spec['Parser'] = struct.Struct( spec['fmtstring'] ).unpack_from

    def CWFiletype( self, cfile ):
        """Guess the child-welfare file format of the open file `cfile`.

        Scans the start of the file for a line matching one of the configured
        'filedelimiter' patterns (first 3 lines only) and measures the longest
        of the following data lines (lines matching '^ ?\\d', up to line 100).
        The stream is then repositioned with seek() to the first data line
        (after the delimiter and the summary line that follows it), or to 0
        when no delimiter was found.

        Returns (filedelimiter, filetype); either may be None.  filetype is a
        key of self.fwf chosen by comparing 'reclen' with the measured length.
        """
        import re
        # Child Welfare file identifier method.
        # Files should be referenced by filename & filetype, since a single file may contain
        # multiple file types (AFCARS contains foster care (@) and adoption ($) files
        datapat = re.compile( r'^ ?\d' )  # data lines start with a 2-digit state code
        filedelimiter = None
        datastartpos = maxlinelen = iline = 0
        # not every stream type carries a .name attribute
        sys.stderr.write( 'CWFiletype: checking %s ...\n' % (getattr(cfile, 'name', cfile),) )
        fpos = 0  # abb 2/28/2013: cfile.tell() reports end of 8K buffer, not where we want!
        for line in cfile:
            iline += 1
            fpos += len(line)
            if not filedelimiter and iline <= 3:
                # test for file delimiter at beginning of line
                for ft in self.fwf:
                    # the configured delimiter is applied as a regular expression
                    if 'filedelimiter' in self.fwf[ft] and re.match( self.fwf[ft]['filedelimiter'], line ):
                        # found a file delimiter. that + the line length will determine filetype.
                        filedelimiter = self.fwf[ft]['filedelimiter']
                        # skip summary line (specific to child welfare files); the
                        # default keeps a delimiter on the last line from raising
                        # StopIteration out of this method.
                        line = next( cfile, '' )
                        iline += 1
                        fpos += len(line)
                        datastartpos = fpos
                        break
            elif iline <= 100 and datapat.match( line ):
                # strip /r/n from line to get proper length
                maxlinelen = max( maxlinelen, len(line.rstrip('\r\n')) )
            else:
                break

        sys.stderr.write( 'CWFiletype: filedelimiter, datastartpos= %s %s .\n' % (filedelimiter, datastartpos) )
        cfile.seek( datastartpos )  # rewind, hope that's always possible?

        filetype = None
        # find closest fwf format with this filedelimiter (if we found one) and reclen <= maxlinelen
        # NOTE(review): dx2 == -1 (format one character longer than the data) compares
        # as "closer" than an exact match (dx2 == 0) -- confirm this is intended.
        dx = ll = maxlinelen
        for ft in self.fwf:
            spec = self.fwf[ft]
            if 'reclen' in spec:
                dx2 = ll - spec['reclen']
                # abb 6/20/2017: allow for lines short by 1 character
                if dx2 >= -1 and dx2 < dx and (not filedelimiter or
                        ('filedelimiter' in spec and spec['filedelimiter']==filedelimiter)):
                    dx = dx2  # closest length so far,
                    filetype = ft  # best filetype candidate so far.

        return( filedelimiter, filetype )


"""
Track a 'removaldate+dob' identifier with this object.  Use to look for infrequent removaldate+dob to
diagnose inter-file linking errors.
removaldate+dob has more unique combos than fips+dob, probably because weekend removals are rare.
ex: 19941222, 20020511
"""
class IDTracker():
    """Count how often each removal date and each 'removaldate:dob' pair occurs.

    rdt   maps removal date -> number of add() calls with that date
    count maps 'rdt:dob'    -> number of add() calls with that pair
    epid  maps 'rdt:dob'    -> list of the episode ids added with that pair
    """
    epid = rdt = count = None

    def __init__(self):
        self.epid = dict()
        self.rdt = dict()
        self.count = dict()

    def add( self, epid, rdt, dob ):
        """Record one episode id under its removal date `rdt` and birth date `dob` (strings)."""
        self.rdt[rdt] = self.rdt.get( rdt, 0 ) + 1

        uid = rdt+':'+dob
        self.count[uid] = self.count.get( uid, 0 ) + 1
        self.epid.setdefault( uid, [] ).append( epid )

    def sort( self ):
        """Re-order self.epid from least to most frequent.

        Keys are ordered by how often their removal date occurs, then by how
        often the 'rdt:dob' pair occurs, then by the key itself so the result
        is deterministic.  An OrderedDict is used because a plain dict does
        not keep insertion order on Python 2, which made the sort a no-op.
        """
        try:
            from collections import OrderedDict
        except ImportError:  # Python < 2.7: keep the old unordered behavior
            OrderedDict = dict

        def sortkey( uid ):
            # A tuple keeps the two counts separate (the former
            # rdt + count/1000.0 mixed them once count reached 1000).
            return( self.rdt[uid[0:uid.rindex(':')]], self.count[uid], uid )
        ks = sorted( self.count, key=sortkey )
        self.epid = OrderedDict( (k, self.epid[k]) for k in ks )


class AFCARSFCDatafile():
    # Hard-coded AFCARS 2003 file info for fallback, in case we don't find a FWF config file:
    fieldnames0 = fieldnames = ('State','RptEndDate','AgencyFIPS','RecNum','MostRecentReview','DOB','Sex','Race','Ethnicity','Disability','MentalRetardation','VisualHearingImpaired','PhysicallyDisabled','EmotionallyDisturbed','OtherMedicalConditions','EverBeenAdopted','HowOldWhenAdopted','FirstRemovalFromHome','TotalRemovals','LastFCDischarge','LatestRemovalfromHome','RemovalTransactionDate','PlcCurrentFC','PlcsThisRemoval','MannerThisRemoval','RRPhysical','RRSexual','RRNeglect','RRAlcoholParent','RRDrugsParent','RRAlcoholChild','RRDrugsChild','RRChildDisability','RRChildBehavior','RRParentDeath','RRParentIncarceration','RRCaretakerCantCope','RRAbandonment','RRRelinquishment','RRInadequateHousing','CurrentPlacementType','OutofStatePlacement','CasePlanGoal','CaretakerFamilyStructure','YOBCaretaker1','YOBCaretaker2','MotherTPRDate','FatherTPRDate','FCFamilyStructure','YOBFosterParent1','YOBFosterParent2','RaceFosterParent1','EthnicityFosterParent1','RaceFosterParent2','EthnicityFosterParent2','FCDischargeDate','FCDischargeTransactionDate','DischargeReason','EligIVEFC','EligIVEAd','EligIVA','EligIVDChildSupport','EligXIXMedicaid','EligSSI','EligNone','MonthlyFCPayment','ExtraField')
    fieldnums = dict(zip( fieldnames, range(0, len(fieldnames)) ))
    fieldwidths = (2,6,5,12,8,8,1,6,1,1,1,1,1,1,1,1,1,8,2,8,8,8,8,2,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,4,4,8,8,1,4,4,6,1,6,1,8,8,1,1,1,1,1,1,1,1,5)
    idfield = fieldnums['RecNum']

    data0 = []
    rownames = []
    epids = []
    rowclasses = filestats = rdtdob = None
    filenames = []
    subdtmax = -1

    def __init__(self):
        """Set up empty record storage and the default (hard-coded) record parser.

        Looks for 'cwfwf.cfg' next to the running script; when it cannot be
        loaded, self.fileformatter is None and only the hard-coded field
        widths are available.
        """
        # Per-instance containers: the class-level lists would otherwise be
        # shared (and appended to) by every instance.
        self.data0 = []
        self.rownames = []
        self.epids = []
        self.filenames = []

        cp = os.path.join( ScriptPath().d, 'cwfwf.cfg' )
        self.fileformatter = FWFFormat( cp )
        if not self.fileformatter or not self.fileformatter.fwf:
            # Surface the reason recorded by FWFFormat before discarding it.
            if self.fileformatter and self.fileformatter.errmsg:
                sys.stderr.write( '%s\n' % (self.fileformatter.errmsg,) )
            self.fileformatter = None
            sys.stderr.write( 'Failed to open AFCARS file format config file cwfwf.cfg\n' )

        import struct
        # One '<width>s' item per field, e.g. '2s6s5s12s...'.
        fmtstring = '%ds'*len(self.fieldwidths) % tuple(self.fieldwidths)
        self.Parser = struct.Struct(fmtstring).unpack_from
        self.reclen = struct.calcsize(fmtstring)

    def ReadFile( self, filename, updatefunc=None ):
        """Read one input file (plain, .bz/.bz2, .gz, .xz or a zip archive) into data0.

        Every member of a zip archive is read.  Names ending in '.csv' go to
        _ReadCSVFile, everything else to _ReadFWFFile.  The base name of each
        file that yielded at least one record is appended to self.filenames;
        that is the same name the readers use as row-name prefix.
        `updatefunc` is passed through to the readers for progress reports.
        """
        import zipfile
        log = sys.stderr.write

        def read_stream( stream, name ):
            # Pick the reader from the (decompressed) file name.
            if name.lower().endswith('.csv'):
                return self._ReadCSVFile( stream, name, updatefunc )
            return self._ReadFWFFile( stream, name, updatefunc )

        if zipfile.is_zipfile(filename):
            log( 'ReadFile: Opening zipfile  %s ...\n' % (filename,) )
            zf = zipfile.ZipFile( filename, 'r' )
            try:
                for member in zf.namelist():
                    fn = os.path.basename( member )
                    if not fn:
                        continue  # directory entry, nothing to read
                    log( 'ReadFile: Reading  %s ...\n' % (member,) )
                    # NOTE(review): FWFFormat.CWFiletype calls seek() on the stream --
                    # confirm zip member streams support that on the target Python.
                    txtfile = zf.open( member, 'r' )
                    try:
                        if read_stream( txtfile, fn ):
                            # store the base name: row names are prefixed with it
                            self.filenames.append( fn )
                    finally:
                        txtfile.close()
            finally:
                zf.close()
        else:
            log( 'ReadFile: Reading  %s ...\n' % (filename,) )
            fn = os.path.basename( filename )
            lowered = fn.lower()
            if lowered.endswith( ('.bz', '.bz2') ):
                import bz2
                log( 'ReadFile: Opening bzip2 file  %s ...\n' % (filename,) )
                txtfile = bz2.BZ2File( filename )
                fn = fn[0:fn.rindex('.')]  # drop the compression suffix
            elif lowered.endswith('.gz'):
                import gzip
                log( 'ReadFile: Opening gzip file  %s ...\n' % (filename,) )
                txtfile = gzip.GzipFile( filename )
                fn = fn[0:fn.rindex('.')]
            elif lowered.endswith('.xz'):
                import lzma # https://github.com/peterjc/backports.lzma
                log( 'ReadFile: Opening xz file  %s ...\n' % (filename,) )
                txtfile = lzma.LZMAFile( filename )
                fn = fn[0:fn.rindex('.')]
            else:
                txtfile = open( filename, 'rb' )

            try:
                if read_stream( txtfile, fn ):
                    self.filenames.append( fn )
            finally:
                txtfile.close()

        log( 'ReadFile:  %s , reclen = %s , nfields = %s , total records = %s , latest submission = %s\n'
             % (filename, self.reclen, len(self.fieldnames), len(self.data0), self.subdtmax) )


    def _ReadFWFFile( self, txtfile, fn, updatefunc=None ):
        """Read fixed-width records from the open file `txtfile` and append them to data0.

        When a file formatter is available it is asked for the file type; for
        types whose name contains 'AFCARSFC' the field names, widths, record
        length and parser are switched to that format.  A line is accepted
        when it starts with an optional space followed by a digit and is at
        most 4 characters shorter than the record length.  Characters beyond
        the record length are kept as the last field ('' when there are none).
        Row names are '<fn>:<line number>'.  Returns the number of records read.
        """
        import re
        log = sys.stderr.write

        ft = None
        if self.fileformatter:
            (fd, ft) = self.fileformatter.CWFiletype( txtfile )
            # not every stream type carries a .name attribute
            log( 'ReadFWFFile: %s filedelim = %s filetype = %s\n' % (getattr(txtfile, 'name', fn), fd, ft) )
            if ft and ft.find('AFCARSFC') >= 0:
                spec = self.fileformatter.fwf[ft]
                self.fieldnames = spec['fieldnames'] + ['ExtraField',]
                self.fieldwidths = spec['fieldwidths']
                self.reclen = spec['reclen']
                self.Parser = spec['Parser']
        if not ft:
            log( 'ReadFWFFile: falling back to AFCARSFC2000 file format.\n' )

        updateincr = 5000 if 'app' in globals() and app.TopWindow else 100000  # better via timer in updatefunc?
        validpat = re.compile( r'^ ?\d' )
        rptfield = self.fieldnums['RptEndDate']
        nlines = nrecs = 0
        for iline in txtfile:
            nlines += 1
            iline = iline.rstrip('\r\n')
            # need some slop in reclen for NDACAN files? last field is 5 chars
            if len(iline) >= (self.reclen-4) and validpat.match( iline ):
                if len(iline) < self.reclen:
                    iline = iline.ljust( self.reclen, ' ' ) # space-pad to reclen for parser
                fields = list(self.Parser(iline))
                # whatever the parser didn't consume becomes ExtraField ('' if nothing is left)
                fields.append( iline[self.reclen:] )
                # show non-ascii recnums in 12 chars? vim shows this as "®©¬®½ºÿúüþúø",
                # winxp as "®©¬®½ºÿúüþúø", same in web browser as ISO-8859-1.
                # decoding to 'raw_unicode_escape' ensures same screen representation on Linux & Win:
                recnum = fields[self.idfield]
                if isinstance( recnum, bytes ):
                    fields[self.idfield] = recnum.decode('raw_unicode_escape')
                try:
                    i = int(self.ParseDate( fields[rptfield] ))
                    if i > self.subdtmax:
                        self.subdtmax = i
                except (ValueError, TypeError):
                    # Report and keep the record.  (This handler used to reference an
                    # undefined name and raised NameError instead of logging.)
                    log( 'ERROR: bad report end date in  %s %s\n' % (fn, nlines) )
                    log( '.'+iline+'.\n' )
                self.data0.append( fields )
                self.rownames.append( fn + ':' + str(nlines) )
                nrecs += 1

            if updatefunc and (nlines % updateincr) == 0 and nlines > 0:
                updatefunc( fn+': ', nlines, 'line', '' )

        return( nrecs )

    def _ReadCSVFile( self, txtfile, fn, updatefunc=None ):
        """Read comma-separated records from the open file `txtfile` and append them to data0.

        NUL bytes are removed from every line before parsing.  A row is a data
        record when its first field starts with an optional space followed by
        a digit.  The first row, when it is not a data record, is taken as the
        header: it replaces self.fieldnames if those are still the built-in
        defaults or the header is longer; in the latter case records already
        in data0 are padded with '' for the added columns.  Blank lines are
        counted but otherwise skipped.  Row names are '<fn>:<row number>'.
        Returns the number of records read.
        """
        import re, csv
        log = sys.stderr.write
        updateincr = 5000 if 'app' in globals() and app.TopWindow else 100000  # put timer in updatefunc instead?
        unicodefields = [self.idfield,]  # column indexes decoded with raw_unicode_escape
        validpat = re.compile( r'^ ?\d' )
        rptfield = self.fieldnums['RptEndDate']
        nlines = nrecs = 0
        # abb 5/15/2017: workaround nulls, http://stackoverflow.com/questions/4166070/python-csv-error-line-contains-null-byte
        for fields in csv.reader((line.replace('\0','') for line in txtfile), delimiter=','):
            nlines += 1
            # csv.reader yields [] for a blank line; fields[0] would raise IndexError
            if fields and validpat.match( fields[0] ):
                for i in unicodefields:
                    if i < len(fields) and isinstance( fields[i], bytes ):
                        fields[i] = fields[i].decode('raw_unicode_escape')

                try:
                    i = int(self.ParseDate( fields[rptfield] ))
                    if i > self.subdtmax:
                        self.subdtmax = i
                except (ValueError, TypeError, IndexError):
                    # report and keep the record
                    log( 'ERROR: bad report end date in  %s %s\n' % (fn, nlines) )
                    log( '. %s .\n' % (fields,) )
                self.data0.append( fields )
                self.rownames.append( fn + ':' + str(nlines) )
                nrecs += 1
            # abb 11/5/2015: update the fieldnames with each file read, use the longest.
            elif fields and nlines == 1 and ( self.fieldnames == self.fieldnames0 or len(fields) > len(self.fieldnames) ):
                # assume we want to use the fieldnames from the csv file instead of the standards?
                # abb 12/7/2016: bug fix. append columns to previous records. field regex impractical.
                nold = len(self.fieldnames)
                if len(fields) > nold:
                    srcname = getattr( txtfile, 'name', fn )  # not every stream has .name
                    for colname in fields[nold:]:
                        log( 'ReadCSVFile: %s adds column: %s\n' % (srcname, colname) )
                    padding = [''] * (len(fields) - nold)
                    for record in self.data0:
                        record.extend( padding )
                self.fieldnames = fields
                # These columns hold non-ascii text as well; remember their positions.
                for colname in ('EpisodeID', 'ExcessData'):
                    if colname in fields:
                        icol = fields.index( colname )
                        if icol not in unicodefields:
                            unicodefields.append( icol )

            if updatefunc and (nlines % updateincr) == 0 and nlines > 0:
                updatefunc( fn+': ', nlines, 'line', '' )

        return( nrecs )

    def SaveFile( self, filename, updatefunc=None ):
        """Write all records to `filename`, or to sys.stdout when filename is 'stdout'.

        The output format is chosen by _SaveFile from the base name of
        `filename`.  A file opened here is always closed again, also when
        writing fails; sys.stdout is never closed.
        """
        import sys
        fn = os.path.basename( filename )
        to_stdout = ( filename == 'stdout' )
        txtfile = sys.stdout if to_stdout else open( filename, 'wb' )
        sys.stderr.write( 'SaveFile: Saving %s ...\n' % (filename,) )
        try:
            self._SaveFile( txtfile, fn, updatefunc )
        finally:
            # Decide by what was opened, not by the base name: a path such as
            # 'dir/stdout' is a real file and must be closed.
            if not to_stdout:
                txtfile.close()

    def _SaveFile( self, txtfile, fn, updatefunc=None ):
        """Write every record of data0 to the open file `txtfile`; return the number written.

        CSV output (header row first) is produced when `fn` ends in '.csv' or
        equals 'stdout'.  Otherwise the fields of each record are joined into
        one line, after an '@' delimiter line and a blank summary line.  When
        rows have been classified, EpisodeID and RecordType columns are
        appended.  The record-number/EpisodeID/ExcessData columns are
        re-encoded as raw_unicode_escape and the date columns are normalized
        with ParseDate.  Records in data0 are not modified.
        """
        # sys.stdout.encoding == None when stdout is a pipe.  Need it to be UTF-8?
        # Originals were not unicode, they were raw_unicode_escape, so keep that way then no diffs.
        step = 5000 if 'app' in globals() and app.TopWindow else 100000

        header = list(self.fieldnames)  # work on a copy
        tagged = self.rowclasses and self.epids
        if tagged:
            # two extra output columns: episode id & record classification
            header.extend( ['EpisodeID','RecordType'] )
            epfmt = '%0.21s'  # do I want EpisodeID left space-padded (%21.21s)?
            rtfmt = '%1.1s'

        writer = None
        if fn.lower().endswith('.csv') or fn == 'stdout':
            import csv
            writer = csv.writer( txtfile )
            writer.writerow( header )
        else:
            # AFCARS foster care file delimiter, @, then a blank summary record.
            txtfile.write( '@\n\n' )

        # columns that may carry non-ascii text
        rawcols = [self.idfield,] + [col for col, name in enumerate(header)
                                     if name in ('EpisodeID','ExcessData')]

        datenames = ('RptEndDate','MostRecentReview','DOB','FirstRemovalFromHome','LastFCDischarge','LatestRemovalfromHome','RemovalTransactionDate','PlcCurrentFC','MotherTPRDate','FatherTPRDate','FCDischargeDate','FCDischargeTransactionDate')
        datecols = [self.fieldnums[k] for k in datenames]

        nrecs = 0
        for irow, record in enumerate(self.data0):
            # All rows are written, duplicates included.
            fields = list(record)  # copy, so data0 stays untouched
            if tagged:
                fields.extend( [epfmt % self.epids[irow], rtfmt % self.rowclasses[irow]] )
            # NDACAN records not being padded out? no, lzma xz decomp error FY2015 rec 87106/irow==4157
            for col in rawcols:
                fields[col] = fields[col].encode('raw_unicode_escape')
            for col in datecols:
                fields[col] = self.ParseDate( fields[col] )

            if writer is not None:
                writer.writerow( fields )
            else:
                txtfile.write( ''.join(fields)+'\n' )
            nrecs += 1

            if updatefunc and (irow % step) == 0 and irow > 0:
                updatefunc( fn+': ', irow, 'line', '' )

        return( nrecs )

    def Sort( self ):
        """Sort data0 (and rownames with it) in descending key order; return data0.

        The key is 'recnum|state|rptenddate|removaldate|placementdate', with
        leading zeros and surrounding blanks stripped from recnum and state
        and the dates normalized by ParseDate.
        """
        # standardize the order of records: sort by recnum, then subdt
        # abb 11/7/2023: then remdate, then plcdate
        # stripping recnum to handle NDACAN mistakes, could cause trouble?
        def sortkey( row ):
            rec = self.data0[row]
            rn = rec[self.idfield].lstrip('0').strip()
            rs = rec[self.fieldnums['State']].lstrip('0').strip() # added 3/15/2017
            rd = self.ParseDate( rec[self.fieldnums['RptEndDate']] )
            rd2 = self.ParseDate( rec[self.fieldnums['LatestRemovalfromHome']] ) # added 11/7/2023
            rd3 = self.ParseDate( rec[self.fieldnums['PlcCurrentFC']] ) # added 11/7/2023
            return( rn+'|'+rs+'|'+rd+'|'+rd2+'|'+rd3 )

        order = sorted( range( 0, len(self.data0) ), key=sortkey, reverse=True )
        # Real lists are built here: a lazy map() would still refer to
        # self.data0 after it has been reassigned.
        self.data0 = [self.data0[i] for i in order]
        # Unwieldy, bug-prone keeping rownames in sync with data0.
        self.rownames = [self.rownames[i] for i in order]
        return( self.data0 )

    def ParseDate( self, dt ):
        """Normalize a date string to 'yyyymmdd' (or 'yyyymm' for 6-character input).

        '-', '/' and '.' are treated like blanks.  Three blank-separated parts
        are read as year, month, day; a year below 1000 is taken as 20yy when
        below 30 and 19yy otherwise.  Anything else is sliced as yyyy/mm[/dd],
        so blank-padded months and days are accepted.

        ParseDate('20070918') -> '20070918'    ParseDate('20079 18')   -> '20070918'
        ParseDate('07-9-18')  -> '20070918'    ParseDate('2007-09-18') -> '20070918'
        ParseDate('2003 9')   -> '200309'

        A value that cannot be parsed is returned as is, apart from the
        separator replacement.
        """
        # NDACAN '20079 18' should be 20070918 ... forced to parse the date.
        dt = dt.replace('-', ' ').replace('/', ' ').replace('.', ' ')
        dt1 = dt.split(' ')
        try:
            if len(dt1) == 3:
                # This assumes the TN data format: 'yy-mm-dd'.  To guess format, I'd need to
                # examine all the dates for valid yy, mm, & dds ...
                yy = int(dt1[0])
                if yy < 1000:
                    yy += 2000 if yy < 30 else 1900
                # Also format dates that already have a 4-digit year; they used to
                # be returned with blanks in place of the separators.
                dt = '%04d%02d%02d' % ( yy, int(dt1[1]), int(dt1[2]) )
            elif len(dt) == 6:
                # 6-char AFCARS submission months, e.g. '20039 ', '2003 9'
                dt = '%04d%02d' % ( int(dt[0:4]), int(dt[4:6]) )
            else:
                dt = '%04d%02d%02d' % ( int(dt[0:4]), int(dt[4:6]), int(dt[6:8]) )
        except (ValueError, TypeError):
            # not a parseable date (e.g. blank): hand it back for the caller to check
            pass
        return( dt )

    def ClassifyRows( self ):
        """Assign a RecordClass code and an episode id to every row of data0.

        Relies on data0 being ordered so that rows of the same child (record
        number + state) are adjacent, newest first -- the order Sort() produces.

        Sets self.epids (one 'recnum:state:removaldate' id per row),
        self.rowclasses (one RecordClass code per row), self.filestats (a
        FileStats with per-file counts) and self.rdtdob (one IDTracker per
        file name).  Returns the list of row classes.

        Every row-name prefix must be present in self.filenames, otherwise the
        per-file lookups raise KeyError.
        """
        # more elegant way to store metadata?
        # use case: grid.data.getattr( row, col ) needs rowclass[row] in (dup, reentry)
        rc = [None] * len(self.data0)
        self.epids = [None] * len(self.data0)
        mostrecent = dict()  # episode id -> row where it was first seen
        self.rdtdob = dict()

        filestats = FileStats()
        for fn in self.filenames:
            # one counter per RecordClass code plus 'Total' and 'Links To Prior Submissions'
            filestats[fn] = [0]*(1+FileStats.ilinks2prior)
            self.rdtdob[fn] = IDTracker()

        if len(self.data0) > 0:
            # going from 0 to len here is moving from latest record to oldest, backward in time.
            for row in range(0, len(self.data0)):
                rdt2 = self.ParseDate( self.data0[row][self.fieldnums['LatestRemovalfromHome']] )
                rn2 = self.data0[row][self.idfield].lstrip('0').strip()  # record number
                rst2 = self.data0[row][self.fieldnums['State']].lstrip('0').strip() # added 3/15/2017
                rn2 = rn2+':'+rst2
                epid = rn2+':'+rdt2
                # if we've seen this epid already, it's a duplicate.
                if epid in mostrecent:
                    rc[row] = RecordClass.Duplicate  # duplicate record
                else:
                    # haven't seen this epid already, but could still be a dup or something else.
                    # (for row 0, row-1 indexes the last row; the 'row > 0' test below discards it)
                    rn1 = self.data0[row-1][self.idfield].lstrip('0').strip()  # record number
                    rst1 = self.data0[row-1][self.fieldnums['State']].lstrip('0').strip() # added 3/15/2017
                    rn1 = rn1+':'+rst1
                    # equal record numbers will identify this as the same child
                    if row > 0 and rn2 == rn1:
                        # same record number as the previous record, so it's either a reentry or a removal date correction.
                        # rule: dup if newer remdate == earlier remdate.
                        rdt1 = self.ParseDate( self.data0[row-1][self.fieldnums['LatestRemovalfromHome']] )
                        if rdt1 == rdt2:
                            # so this epid == previous epid, yet the epid was not in the mostrecent list?
                            # this can happen if the previous record was a removal date correction.
                            rc[row] = RecordClass.Duplicate  # duplicate record
                            epid = self.epids[row-1]  # in case previous was a removal date correction
                        else:
                            # same record number as previous, but different removal date.
                            # In the rules below "newer" is row-1 and "earlier" is row.
                            # rule: it's a correction if newer nrem == earlier nrem.
                            l1 = self.data0[row][self.fieldnums['TotalRemovals']] == self.data0[row-1][self.fieldnums['TotalRemovals']]
                            # rule: it's a correction if newer missing lastdisdate or == earlier lastdisdate.
                            ldt1 = self.ParseDate( self.data0[row-1][self.fieldnums['LastFCDischarge']] ).strip()
                            ldt2 = self.ParseDate( self.data0[row][self.fieldnums['LastFCDischarge']] ).strip()
                            l2 = len(ldt1) != 8 or ldt1 == ldt2
                            # rule: it's not a correction if newer has firstrem and == earlier remdate.
                            fdt1 = self.ParseDate( self.data0[row-1][self.fieldnums['FirstRemovalFromHome']] ).strip()
                            l3 = len(fdt1) == 8 and fdt1 == rdt2
                            # rule: it's not a correction if newer has lastdisdate and == earlier disdate
                            ddt2 = self.ParseDate( self.data0[row][self.fieldnums['FCDischargeDate']] ).strip()
                            l4 = len(ldt1) == 8 and ldt1 == ddt2

                            if l1 and l2 and not (l3 or l4):
                                #print >> sys.stderr, 'ClassifyRows: remdate correction?:', epid
                                rc[row] = RecordClass.Duplicate  # duplicate (corrected) record
                                epid = self.epids[row-1]  # epid incorrect, set to newer episode id
                            else:
                                # NOTE: reentry missing discharge also an "undis from prior" ... need multi-class
                                rc[row] = RecordClass.ReenteringDischarge  # reentry record
                                mostrecent[epid] = row
                    # new record number we haven't seen yet.
                    # (int() raises ValueError here when ParseDate could not normalize the report end date)
                    elif int(self.ParseDate(self.data0[row][self.fieldnums['RptEndDate']])) != self.subdtmax and self.data0[row][self.fieldnums['FCDischargeDate']].strip() == '':
                        # if it's from a submission earlier than the most recent & it's missing the discharge date,
                        # then categorize record as undischarged from prior submission.
                        rc[row] = RecordClass.MissingDischarge  # undischarged from prior submission
                        mostrecent[epid] = row
                    else:
                        # otherwise, it's the most recent episode for the child with this record number.
                        rc[row] = RecordClass.MostRecentEpisode  # most recent episode record
                        mostrecent[epid] = row

                self.epids[row] = epid  # record these to save with record later

                # row names are '<file name>:<line number>'; strip the line number
                fn = self.rownames[row][0:self.rownames[row].rindex(':')]
                filestats[fn][rc[row]] += 1
                filestats[fn][FileStats.iftotal] += 1

                dob = self.ParseDate( self.data0[row][self.fieldnums['DOB']] )
                self.rdtdob[fn].add( epid, rdt2, dob )

            # to get links to priors, must reverse loop direction.
            # Walking oldest to newest, every row whose child id was already seen
            # counts as a link for the file that row came from.
            ids = dict()
            for row in reversed(range(0, len(self.data0))):
                rn0 = self.data0[row][self.idfield].lstrip('0').strip()  # record number
                rst0 = self.data0[row][self.fieldnums['State']].lstrip('0').strip() # added 3/15/2017
                id0 = rn0+':'+rst0
                if id0 in ids:
                    fn = self.rownames[row][0:self.rownames[row].rindex(':')]
                    filestats[fn][FileStats.ilinks2prior] += 1
                else:
                    ids[id0] = row  # record number we haven't seen yet

        self.filestats = filestats
        self.rowclasses = rc
        return( rc )

    def GetNumberRows( self ):
        """Return how many records are currently held in data0."""
        nrows = len( self.data0 )
        return nrows

    def GetRowNames( self ):
        """Return the list of row names (the list object itself, not a copy)."""
        names = self.rownames
        return names

    def GetFieldWidths( self ):
        """Return the field widths of the record format currently in use."""
        widths = self.fieldwidths
        return widths

    def GetFieldNames( self ):
        """Return the field names of the record format currently in use."""
        names = self.fieldnames
        return names

    def GetFilenames( self ):
        """Return the names of the files that contributed at least one record."""
        names = self.filenames
        return names

    def GetFileStats( self ):
        """Return the per-file statistics stored by ClassifyRows (None before it has run)."""
        stats = self.filestats
        return stats

    def RareRecnumPats( self, fn0, npats ):
        """Return up to `npats` 'recnum0|recnum1' patterns linking file `fn0` to other files.

        For each removaldate+dob id of `fn0` (in the order the tracker's
        sort() leaves them) that also occurs in another file, the first
        episode id stored under that id in each file is reduced to its part
        before the last ':'; when the two differ, 'part0|part1' is added to
        the result.  Files without a tracker (read after the last
        ClassifyRows) are skipped instead of raising KeyError.  The result is
        also written to stderr.
        """
        def recnum( epid ):
            # strip the trailing ':<removal date>' from an episode id
            return epid[0:epid.rindex(':')]

        # sort rdtdob[fn0] from least to most frequent
        tracker = self.rdtdob[fn0]
        tracker.sort()

        # The other files do not change while searching: look them up once.
        others = [self.rdtdob[fn].epid for fn in sorted(self.filenames)
                  if fn != fn0 and fn in self.rdtdob]

        pats = []
        seen = set()
        for id0 in tracker.epid.keys():
            if len(pats) >= npats:
                break
            rn0 = recnum( tracker.epid[id0][0] )
            # look for infrequent ids in other files
            for epids in others:
                if id0 not in epids:
                    continue
                rn1 = recnum( epids[id0][0] )
                if rn0 == rn1:
                    continue
                pat = rn0+'|'+rn1
                if pat not in seen:
                    seen.add( pat )
                    pats.append( pat )
                    if len(pats) >= npats:
                        break

        sys.stderr.write( 'RareRecnumPats: %s\n' % (pats,) )
        return( pats )


