#!/usr/bin/env python3
"""Convert irssi IRC logs to plain text or to ANSI-colorized text.

The log may be given as a file path or piped in on stdin, and may be
uncompressed or compressed with Gzip, Bzip2 or XZ (automatic detection of
compression requires the optional ``magic`` module).
"""

## REFERENCE ##
# https://github.com/myano/jenni/wiki/IRC-String-Formatting
# https://www.mirc.com/colors.html
# https://en.wikipedia.org/wiki/ANSI_escape_code
# https://github.com/shabble/irssi-docs/wiki/Formats#Colourising-Text-in-IRC-Messages


import argparse
import curses
import os
import pprint
import re
import sys
try:
    import magic
    has_magic = True
except ImportError:
    # Warn on stderr so that the notice never ends up inside piped log output.
    print('Warning: you do not have the magic module installed (you can '
          'install it via "pip3 install --user file-magic"). Automatic log '
          'decompression will not work.', file=sys.stderr)
    has_magic = False

# This is a map to determine which module to use to decompress,
# if we should. MIME types that are not listed are treated as uncompressed.
cmprsn_map = {'text/plain': None,  # Plain ol' text
              # Sometimes the formatting with color gives this
              'application/octet-stream': None,
              'application/x-bzip2': 'bz2',  # Bzip2
              'application/x-gzip': 'gzip',  # Gzip
              'application/gzip': 'gzip',  # Gzip (newer libmagic databases)
              'application/x-xz': 'lzma'}  # XZ

# irssi/mIRC to ANSI
# Split into 3 maps (get_palette() currently never selects 'truecolor'):
# - 8 (3/4 bit color values, 8 colors)
# - 256 (8-bit, 256 colors)
# - 'truecolor' (24-bit, ISO-8613-3, 16777216 colors)
# Keys are the mIRC color value
# Values are:
# - 8: tuple of the ANSI (fg, bg) values
# - 256: single value (same number is used for fg and bg)
# - 'truecolor': tuple of (R#, G#, B#) (same numbers are used for fg and bg)
# In addition, all three have the following:
# - ansi_wrap: the string formatters. Each one is a complete escape sequence.
#              fg: foreground color
#              bg: background color (if present)
#              They are concatenated together in that order.
## https://en.wikipedia.org/wiki/ANSI_escape_code#3/4_bit
colormap = {8: {'0': ('97', '107'),
                '1': ('30', '40'),
                '2': ('34', '44'),
                '3': ('32', '42'),
                '4': ('91', '101'),
                '5': ('31', '41'),
                '6': ('35', '45'),
                '7': ('33', '43'),
                '8': ('93', '103'),
                '9': ('92', '102'),
                '10': ('36', '46'),
                '11': ('96', '106'),
                '12': ('94', '104'),
                '13': ('95', '105'),
                '14': ('90', '100'),
                '15': ('37', '47'),
                'ansi_wrap': {'fg': '\x1b[{0}m',
                              'bg': '\x1b[{0}m'}},
            ## https://en.wikipedia.org/wiki/ANSI_escape_code#8-bit
            256: {'0': '15',
                  '1': '0',
                  '2': '19',
                  '3': '34',
                  '4': '196',
                  '5': '52',
                  '6': '90',
                  '7': '208',
                  '8': '226',
                  '9': '82',
                  '10': '37',
                  '11': '51',
                  '12': '21',
                  '13': '199',
                  '14': '241',
                  '15': '252',
                  'ansi_wrap': {'fg': '\x1b[38;5;{0}m',
                                'bg': '\x1b[48;5;{0}m'}},
            ## https://en.wikipedia.org/wiki/ANSI_escape_code#24-bit
            # (can just use mIRC's R,G,B)
            'truecolor': {'0': ('255', '255', '255'),
                          '1': ('0', '0', '0'),
                          '2': ('0', '0', '127'),
                          '3': ('0', '147', '0'),
                          '4': ('255', '0', '0'),
                          '5': ('127', '0', '0'),
                          '6': ('156', '0', '156'),
                          '7': ('252', '127', '0'),
                          '8': ('255', '255', '0'),
                          '9': ('0', '252', '0'),
                          '10': ('0', '147', '147'),
                          '11': ('0', '255', '255'),
                          '12': ('0', '0', '252'),
                          '13': ('255', '0', '255'),
                          '14': ('127', '127', '127'),
                          '15': ('210', '210', '210'),
                          # SGR 38;2 / 48;2 take ';'-separated R;G;B.
                          'ansi_wrap': {'fg': '\x1b[38;2;{0[0]};{0[1]};{0[2]}m',
                                        'bg': '\x1b[48;2;{0[0]};'
                                              '{0[1]};{0[2]}m'}}}

# Patterns used by plain_stripper(); compiled once at import time.
_NICK_PTRN = re.compile('\x04[89]/')
_PLAIN_PTRNS = (re.compile('\x04(g|c|[389;]/?|e|>)/?'),
                re.compile('((\x03)[0-9][0-9]?,[0-9][0-9]?|'
                           '(\x03)[0-9][0-9]?|[\x01-\x1F])'))


def _split_lines(text):
    """Split *text* on LF / CRLF only and return the list of lines.

    str.splitlines() is deliberately not used: it also treats \\x1c-\\x1e as
    line boundaries, and \\x1d is the IRC italic control code, so a formatted
    message would be cut into several lines. As with splitlines(), a single
    trailing newline does not produce a trailing empty line.
    """
    lines = re.split('\r?\n', text)
    if lines and lines[-1] == '':
        lines.pop()
    return lines


def _term_color_count():
    """Return the terminfo 'colors' capability, or 0 if it is unavailable.

    Uses setupterm()/tigetnum() instead of initscr() so the screen is not
    cleared and a missing or unknown terminal does not abort the program.
    """
    try:
        curses.setupterm()
        return curses.tigetnum('colors')
    except (curses.error, OSError, ValueError):
        return 0


def _detect_mime(data):
    """Return the MIME type of the bytes *data* using the magic module.

    Supports both the file-magic API (detect_from_content) and the
    python-magic API (from_buffer), since both install a module named
    ``magic``.
    """
    if hasattr(magic, 'detect_from_content'):
        return magic.detect_from_content(data).mime_type
    return magic.from_buffer(data, mime=True)


def get_palette():
    """Return the key of the colormap palette to use: 8 or 256.

    The result is always a valid key of ``colormap``; terminals reporting
    any other number of colors are clamped down to the 8-color palette.
    """
    if os.getenv('COLORTERM', None) == 'truecolor':
        # TODO: 24-bit support (16777216 colors) instead of 8-bit.
        #return('truecolor')
        return 256
    return 256 if _term_color_count() >= 256 else 8


def color_converter(data_in, palette_map):
    """Convert mIRC/irssi formatting codes in *data_in* to ANSI escapes.

    Only used if irssiLogParser().args['color'] is true.

    data_in: the log text (str).
    palette_map: a key of ``colormap`` (8, 256 or 'truecolor').
    Returns the converted text, with lines joined by newlines. Any line that
    received an escape sequence ends with a full reset so that formatting
    does not bleed into the following line.
    """
    _colors = colormap[palette_map]
    # the value to reset formatting to the terminal default
    reset_char = '\x1b[0m'
    # the value to reset the foreground text
    def_fg = '\x1b[39m'
    # the value to reset the background text
    def_bg = '\x1b[49m'
    bold_on = '\x1b[1m'
    bold_off = '\x1b[22m'
    regex = {'nick': re.compile('\x04[89]/'),
             # \x02 toggles bold; a full reset also turns bold off, so both
             # are tokenized together to keep the toggle state accurate.
             'bold': re.compile(r'(\x02|\x1b\[0m)'),
             ## Doublecheck this. what is the significance of \x04(.*) chars?
             'reset': re.compile('\x04(g|c|[389;]/?|e|>)?/?'),
             # \x0f stands alone; it must not swallow the text following it.
             'color_clear': re.compile('\x0f'),
             # \x03 with an optional fg and an optional ",bg".
             'color': re.compile(
                 '\x03(?:([0-9]{1,2})(?:,([0-9]{1,2}))?)?')}

    def _ansi(code, layer):
        # Map one mIRC color number to the escape for 'fg' or 'bg'.
        value = _colors.get(str(int(code)))  # int() drops zero padding
        if value is None:
            # Not one of the 16 mapped colors: fall back to the default.
            return def_fg if layer == 'fg' else def_bg
        if palette_map == 8:
            # The 8-color palette stores a separate (fg, bg) pair.
            value = value[0 if layer == 'fg' else 1]
        return _colors['ansi_wrap'][layer].format(value)

    def _repl(ch_in):
        # Generate the replacement for one \x03 color code.
        fg, bg = ch_in.groups()
        if fg is None:
            # A bare \x03 clears the colors.
            return def_fg + def_bg
        ch_out = _ansi(fg, 'fg')
        if bg is not None:
            ch_out += _ansi(bg, 'bg')
        return ch_out

    def _toggle_bold(line):
        # Replace each \x02 with bold-on/bold-off, alternating per line.
        bold = False
        pieces = []
        for piece in regex['bold'].split(line):
            if piece == '\x02':
                bold = not bold
                piece = bold_on if bold else bold_off
            elif piece == reset_char:
                bold = False
            pieces.append(piece)
        return ''.join(pieces)

    converted = []
    for line in _split_lines(data_in):
        # Get some preliminary replacements out of the way.
        line = regex['nick'].sub(' ', line, 1)
        line = regex['reset'].sub(reset_char, line)
        line = regex['color_clear'].sub(reset_char, line)
        # First number is the text color; the second, if present, the bg.
        line = regex['color'].sub(_repl, line)
        line = _toggle_bold(line)
        if '\x1b[' in line and not line.endswith(reset_char):
            line += reset_char
        converted.append(line)
    return '\n'.join(converted)


def plain_stripper(data_in):
    """Strip *data_in* (str) down to plaintext only and return it.

    The first irssi nick marker on each line is replaced by a space; all
    other irssi/mIRC formatting codes and control characters are removed.
    """
    stripped = []
    for line in _split_lines(data_in):
        # This cleans the nick field
        line = _NICK_PTRN.sub(' ', line, 1)
        # And these clean the actual chat messages
        for ptrn in _PLAIN_PTRNS:
            line = ptrn.sub('', line)
        stripped.append(line)
    return '\n'.join(stripped)


class irssiLogParser(object):
    """Load, decompress and convert an irssi log.

    After construction, ``self.raw`` holds the decoded log text as it was
    read and ``self.data`` holds the converted text.
    """

    def __init__(self, args, data = None):
        """Read the log and convert it.

        args: dict of options. Keys read here are 'color', 'raw', 'html'
              and 'logfile'; missing keys count as unset. The dict is
              updated in place ('color' and 'logfile' may be rewritten).
        data: optional log content (str or bytes). If specified, it takes
              precedence over args['logfile'].

        Raises RuntimeError if color output was requested, the terminal
        does not report color support and 'raw' is not set. Also raises
        whatever getlog() raises.
        """
        # We'll need these accessible across the entire class.
        self.args = args
        self.data = data
        self.raw = data
        self.has_html = False
        self.decompress = None
        if self.args.get('color'):
            force = self.args.get('raw', False)
            if not self.args.get('html'):
                # Ensure that we support color output.
                self.args['color'] = _term_color_count() >= 8
            if not self.args['color']:
                if not force:
                    raise RuntimeError('You have specified ANSI colorized '
                                       'output but your terminal does not '
                                       'support it. Use -r/--raw '
                                       'to force.')
                self.args['color'] = True  # Force the output anyways.
            # Best play it safe for maximum compatibility when forced.
            self.colors = 8 if force else get_palette()
        # The full, interpreted path.
        if self.args.get('logfile') is not None:
            self.args['logfile'] = os.path.abspath(
                                        os.path.expanduser(
                                            self.args['logfile']))
        if not self.data:
            self.getlog()
        elif not isinstance(self.data, bytes):
            # Conform everything to bytes.
            self.data = self.data.encode('utf-8')
        self.decompressor()
        self.parser()

    def getlog(self):
        """Read the log bytes from args['logfile'] or, failing that, stdin.

        Raises FileNotFoundError if the given path is not a file, and
        ValueError if no path was given and stdin is a terminal.
        """
        logfile = self.args.get('logfile')
        if logfile:
            # A filepath was specified
            if not os.path.isfile(logfile):
                raise FileNotFoundError('{0} does not exist'.format(logfile))
            with open(logfile, 'rb') as f:
                self.data = f.read()
        elif not sys.stdin.isatty():
            # Try to get it from stdin
            self.data = sys.stdin.buffer.read()
        else:
            raise ValueError('Either a path to a logfile must be '
                             'specified or you must pipe a log in from '
                             'stdin.')
        self.raw = self.data

    def decompressor(self):
        """Decompress self.data if needed and decode it to text.

        On return both self.raw and self.data hold the decoded str. Bytes
        that are not valid UTF-8 are replaced rather than aborting, so the
        parser always receives a str.
        """
        # TODO: use mime module as fallback?
        # https://docs.python.org/3/library/mimetypes.html
        # VERY less-than-ideal since it won't work without
        # self.args['logfile'] (and has iffy detection at best, since it
        # relies on file extensions).
        # Determine what decompressor to use, if we need to.
        if has_magic:
            self.decompress = cmprsn_map.get(_detect_mime(self.data))
        payload = self.data
        if self.decompress:
            import importlib
            decmp = importlib.import_module(self.decompress)
            payload = decmp.decompress(payload)
        # Decode the (decompressed) payload, never the compressed bytes.
        self.raw = payload.decode('utf-8', errors='replace')
        self.data = self.raw

    def parser(self):
        """Convert self.data to plaintext or, if color is on, to ANSI."""
        if not self.args.get('color'):
            self.data = plain_stripper(self.data)
        else:
            self.data = color_converter(self.data, self.colors)
            # Just in case...
            self.data += '\x1b[0m'


def parseArgs():
    """Build and return the command-line argument parser."""
    # NOTE(review): the line constructing the parser is not part of the
    # visible diff hunks; a bare ArgumentParser() is assumed here. Restore
    # any description/epilog arguments from the real file.
    args = argparse.ArgumentParser()
    args.add_argument('-c', '--color',
                      dest = 'color',
                      action = 'store_true',
                      help = ('Print the log with converted colors (ANSI)'))
    args.add_argument('-r', '--raw',
                      dest = 'raw',
                      action = 'store_true',
                      help = ('Use this switch if your terminal is detected '
                              'as not supporting color output but wish to '
                              'force it anyways. A string representation of '
                              'the ANSI output will be produced instead ('
                              'suitable for pasting elsewhere). Only used if '
                              '-c/--color is enabled (ignored with '
                              '-H/--html)'))
    args.add_argument('-H', '--html',
                      dest = 'html',
                      action = 'store_true',
                      help = ('Render HTML output'))
    args.add_argument(dest = 'logfile',
                      default = None,
                      nargs = '?',
                      metavar = 'path/to/logfile',
                      help = ('The path to the log file. It can be uncompressed ' +
                              'or compressed with XZ/LZMA, Gzip, or Bzip2. '
                              'If not specified, read from stdin'))
    return args


if __name__ == '__main__':
    args = vars(parseArgs().parse_args())
    log = irssiLogParser(args)
    print(log.data)