check-in for mirror checker
parent 54751f9753
commit f47f2f8640
TODO | 3 (new file)
@@ -0,0 +1,3 @@
-git
-net/addr needs DNS/PTR/allocation stuff etc.
-net/mirroring
net/mirroring/check.py | 151 (new executable file)
@@ -0,0 +1,151 @@
#!/usr/bin/env python3

import argparse
import csv
import datetime
import difflib
import hashlib
import lzma
import os
import pickle
from urllib.request import urlopen


# TODO: to avoid race conditions, we should probably simply ignore/remove
# timestamps and just touch the cache file whenever checking.

class website(object):
    def __init__(self, args, csvline):
        # Field names
        self.fnames = ('UUID', 'url', 'checksum', 'timestamp')
        self.args = args
        self.parseCSV([csvline])
        self.cache = args['cache_dir']
        self.cacheControl()
        self.remoteFetch()
        return

    def parseCSV(self, data):
        _rows = csv.DictReader(data,
                               fieldnames = self.fnames,
                               delimiter = ',',
                               quotechar = '"')
        for r in _rows:
            self.meta = r
            break  # We only want one, so if we SOMEHOW got more than one line...
        return

    def cacheControl(self):
        os.makedirs(self.cache, exist_ok = True)
        self.site = {}
        _cachefile = os.path.join(self.cache, self.meta['UUID'])
        if os.path.isfile(_cachefile):
            # Cache hit; the "local" copy is the previously fetched page.
            with lzma.open(_cachefile, mode = 'rb') as f:
                self.site['local'] = pickle.load(f)
        else:
            # First time seeing this UUID; fetch, cache, and treat the fetched
            # content as the "local" copy.
            with urlopen(self.meta['url']) as _site, \
                    lzma.open(_cachefile,
                              mode = 'wb',
                              check = lzma.CHECK_SHA256,
                              preset = 9 | lzma.PRESET_EXTREME) as f:
                _data = _site.read().decode('utf-8')
                pickle.dump(_data, f)
                self.site['local'] = _data
        self.meta['timestamp'] = str(int(datetime.datetime.now().timestamp()))
        _hash = hashlib.sha256(self.site['local'].encode('utf-8'))
        self.meta['checksum'] = str(_hash.hexdigest())
        return

    def remoteFetch(self):
        with urlopen(self.meta['url']) as _site:
            self.site['remote'] = _site.read().decode('utf-8')
        _hash = hashlib.sha256(self.site['remote'].encode('utf-8'))
        self.site['remotesum'] = str(_hash.hexdigest())
        self.meta['timestamp'] = str(int(datetime.datetime.now().timestamp()))
        return

    def compare(self):
        # Don't even compare if the checksums match.
        if self.site['remotesum'] == self.meta['checksum']:
            self.diff = None
            #print('{0}: Doing nothing'.format(self.meta['UUID']))
            return
        print('{{{0}}}: "{1}":'.format(self.meta['UUID'], self.meta['url']))
        diff = difflib.unified_diff(self.site['local'].splitlines(keepends = True),
                                    self.site['remote'].splitlines(keepends = True))
        self.diff = ''.join(diff)
        print(self.diff)
        # Cache the content we just diffed rather than fetching the URL a third
        # time; a re-fetch could race against a site update and desync the cache
        # from the checksum we record in the CSV.
        with lzma.open(os.path.join(self.cache, self.meta['UUID']),
                       mode = 'wb',
                       check = lzma.CHECK_SHA256,
                       preset = 9 | lzma.PRESET_EXTREME) as f:
            pickle.dump(self.site['remote'], f)
        return

    def writeCSV(self):
        # Write unconditionally, even if there was no diff, because we're
        # updating the last-fetch timestamp.
        _lines = []
        with open(self.args['urls_csv'], 'r', newline = '') as f:
            _f = f.read()
        _rows = csv.DictReader(_f.splitlines(),
                               fieldnames = self.fnames,
                               delimiter = ',',
                               quotechar = '"')
        for r in _rows:
            if r['UUID'] == self.meta['UUID']:
                r['checksum'] = self.site['remotesum']
                r['timestamp'] = self.meta['timestamp']
            _lines.append(r)
        with open(self.args['urls_csv'], 'w', newline = '') as f:
            _w = csv.DictWriter(f,
                                fieldnames = self.fnames,
                                delimiter = ',',
                                quotechar = '"',
                                quoting = csv.QUOTE_ALL)
            _w.writerows(_lines)
        return


def parseArgs():
    # Define defaults
    _self_dir = os.path.dirname(os.path.realpath(__file__))
    _cache_dir = os.path.join(_self_dir, 'cache')
    _urls_csv = os.path.join(_self_dir, 'urls.csv')
    args = argparse.ArgumentParser()
    args.add_argument('-c',
                      '--cache-dir',
                      metavar = '/path/to/cache/dir/',
                      default = _cache_dir,
                      dest = 'cache_dir',
                      type = str,
                      help = ('The path to where cached versions of websites are stored. ' +
                              'They are stored in the python binary "pickle" format. ' +
                              'Default: \n\n\t\033[1m{0}\033[0m').format(_cache_dir))
    args.add_argument('-u',
                      '--urls',
                      metavar = '/path/to/urls.csv',
                      default = _urls_csv,
                      dest = 'urls_csv',
                      type = str,
                      help = ('The path to where a CSV file of the URLs to check should be. ' +
                              'Note that it should be writable by whatever user the script is running as. ' +
                              'See urls.csv.spec for the specification. ' +
                              'Default: \n\n\t\033[1m{0}\033[0m').format(_urls_csv))
    return args


def main():
    args = vars(parseArgs().parse_args())
    for d in ('cache_dir', 'urls_csv'):
        args[d] = os.path.realpath(os.path.expanduser(args[d]))
    with open(args['urls_csv'], 'r', newline = '') as f:
        _csv = f.read()
    for line in _csv.splitlines():
        w = website(args, line)
        w.compare()
        w.writeCSV()
        # compare() already printed the diff (if any), so there's nothing
        # further to do here.


if __name__ == '__main__':
    main()
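The cache files that check.py writes are nothing exotic: each is an LZMA-compressed pickle of the page text, named after the page's UUID. A minimal sketch for reading one back, assuming the default cache directory next to the script and borrowing the example UUID from urls.csv.spec:

#!/usr/bin/env python3
# Minimal sketch: read back a page cached by check.py.
# Assumptions: the default cache layout ('cache/' next to this script) and a
# hypothetical/example UUID taken from urls.csv.spec.
import lzma
import os
import pickle

_uuid = '16728c9e-5fde-4f63-8a36-4a3db612be8d'
_cachefile = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'cache', _uuid)
with lzma.open(_cachefile, mode = 'rb') as f:
    page = pickle.load(f)
print(page[:200])  # first 200 characters of the cached page text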
net/mirroring/urls.csv.spec | 15 (new file)
@@ -0,0 +1,15 @@
"UUID","URL","SHA256_of_CONTENT","LAST_FETCHED_IN_UNIX_EPOCH"

UUID can be any non-whitespace, non-slashed string suitable for filenames, but I recommend a UUID4.
You can generate one at either https://www.uuidgenerator.net/ or via python:

>>> import uuid
>>> str(uuid.uuid4())
'16728c9e-5fde-4f63-8a36-4a3db612be8d'

It should be unique for every page.

You can generate a UNIX Epoch timestamp via:

date '+%s'
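
For reference, the python equivalent (the same expression check.py uses internally; the value shown is just an example):

>>> import datetime
>>> str(int(datetime.datetime.now().timestamp()))
'1514764800'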
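Putting it together, a full urls.csv line might look like this (hypothetical values; the checksum shown is the SHA256 of the empty string, and check.py will overwrite both the checksum and the timestamp on its first run):

"16728c9e-5fde-4f63-8a36-4a3db612be8d","https://www.example.com/","e3b0c44298fc1c149afbf4c8996fb92427ae41e4649b934ca495991b7852b855","1514764800"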