about to change up a lot of stuff...

2020-06-14 00:53:12 -04:00
parent 5526111743
commit 2ba79cd801
6 changed files with 192 additions and 13 deletions
--- a/sample.config.xml
+++ b/sample.config.xml
@@ -6,57 +6,108 @@
        xmlns="https://git.square-r00t.net/RepoMirror/"
        xsi:schemaLocation="https://git.square-r00t.net/RepoMirror/ http://schema.xml.r00t2.io/projects/repomirror.xsd">
  <distro name="arch">
    <!--
      If provided (and the sync script is running as the root user), the files/directories can be chowned to the
      provided user/group. Otherwise they'll be owned by whatever user the script is running as (and its primary group).
    -->
    <owner>
      <user>root</user>
      <group>root</group>
    </owner>
    <!--
      The local path to where the hierarchy/files should be synced to.
    -->
    <dest>/srv/repos/arch/.</dest>
    <!--
-      The local file to update with a timestamp with the last time we checked for updates.
+      The local file to update with a timestamp with the last time we *checked* for updates.
      If not provided, don't update a file (NOT recommended!).
      It may or may not be optional; check with the spec for mirroring for the specified distro.
      If the timeFormat attribute is provided, write the timestamp in the specified format.
      See the following for details:
        * https://docs.python.org/library/datetime.html#strftime-and-strptime-format-codes
        * https://strftime.org/
      The default is to use a regular UNIX Epoch integer (e.g. June 13, 2020 5:03:53 PM UTC => 1592067833).
      This can be manually specified by the special string "UNIX_EPOCH".
      Optionally, you can use the special string "MICROSECOND_EPOCH", which will specify the above with microseconds.
      e.g. June 13, 2020 5:09:13.995777 PM UTC => 1592068153.995777
    -->
-    <lastLocalCheck>/srv/http/arch.lastcheck</lastLocalCheck>
+    <lastLocalCheck timeFormat="MICROSECOND_EPOCH">/srv/http/arch.lastcheck</lastLocalCheck>
    <!--
-      The file to update with a timestamp with the last time we synced from our upstream.
+      The file to update with a timestamp with the last time we *synced from our upstream*.
      If not provided, don't update a file (NOT recommended!).
      It may or may not be optional; check with the spec for mirroring for the specified distro.
      It takes the same optional attribute "timeFormat" as above, with the same behaviour.
    -->
-    <lastLocalSync>/srv/http/arch.lastsync</lastLocalSync>
+    <lastLocalSync timeFormat="UNIX_EPOCH">/srv/repos/arch/lastsync</lastLocalSync>
    <!--
      The path to a file on the upstream(s) that gives a time when it last updated.
      The optional timeFormat attribute behavior is the same as above.
      If neither this nor lastRemoteSync is provided, a sync will be attempted regardless of when the last one was
      attempted.
    -->
-    <lastRemoteUpdate>/lastupdate</lastRemoteUpdate>
+    <lastRemoteUpdate timeFormat="UNIX_EPOCH">/lastupdate</lastRemoteUpdate>
    <!--
      The path to a file on the upstream(s) that gives a time when it last synced from its upstream.
      The optional timeFormat attribute behavior is the same as above.
      If neither this nor lastRemoteUpdate is provided, a sync will be attempted regardless of when the last one was
      attempted.
    -->
-    <lastRemoteSync>/lastsync</lastRemoteSync>
+    <lastRemoteSync timeFormat="UNIX_EPOCH">/lastsync</lastRemoteSync>
    <!--
      The path that must be currently mounted for sync to proceed.
      This is required.
    -->
    <mountCheck>/</mountCheck>
    <!--
-      The speed to cap socket bandwidth at (in KiB). Decimals are okay.
+      You cannot reliably use two dashes in XML strings, so this is a workaround.
      The following is only used for rsync upstreams and is optional. The default is just archive and delete-after.
      If arguments are provided, the defaults are overwritten so if you need the above, be sure to specify them.
      See the rsync man page (rsync(1)) for more details and a listing of supported flags on your system.
    -->
-    <bwlimit>7000</bwlimit>
+    <rsyncArgs>
      <!--
        A "long" option (two hyphens).
      -->
      <long>archive</long>
      <long>delete-after</long>
      <!--
        An argument with a value (info=2).
      -->
      <long value="2">info</long>
      <!--
        A "short" option (single hyphen).
      -->
      <short>c</short><!-- checksum -->
    </rsyncArgs>
    <upstream>
      <!--
        The following example uses "rsync://arch.mirror.constant.com/archlinux/"
        (https://www.archlinux.org/mirrors/constant.com/1008/)
      -->
      <!--
-        One of:
+        Required; one of:
          * rsync
          * ftp
      -->
      <syncType>rsync</syncType>
      <!--
-        ONLY the domain goes here.
+        Required; ONLY the domain goes here.
      -->
      <domain>arch.mirror.constant.com</domain>
      <!--
-        If not specified,the protocol's default port will be used.
+        Optional; if not specified, the protocol's default port will be used.
      -->
      <port>873</port>
      <!--
-        The *remote* path part of the URI. The leading / is necessary. A trailing one will be assumed.
+        Required; the *remote* path part of the URI. The leading / is necessary. A trailing one will be assumed.
      -->
      <path>/archlinux/</path>
      <!--
        The speed to cap socket bandwidth at (in KiB). Decimals are okay.
        Only valid for rsync; ignored for FTP. If not provided, the default is to not throttle.
      -->
      <bwlimit>7000</bwlimit>
    </upstream>
    <!--
      Multiple upstreams can be specified. They are tried in order specified and if connection fails or times out,
@@ -77,5 +128,14 @@
      <path>/distros/archlinux/</path>
    </upstream>
  </distro>
-  <distro name="centos"/>
+  <distro name="centos">
    <upstream>
      <syncType>rsync</syncType>
      <domain>mirrors.rit.edu</domain>
      <path>/centos/</path>
    </upstream>
    <!--
      Each distro needs its own destination and sync timestamp file; sharing another distro's tree would let one
      sync overwrite (or, with delete-after, remove) the other's files.
    -->
    <dest>/srv/repos/centos/.</dest>
    <lastLocalCheck timeFormat="MICROSECOND_EPOCH">/srv/http/centos.lastcheck</lastLocalCheck>
    <lastLocalSync timeFormat="UNIX_EPOCH">/srv/repos/centos/lastsync</lastLocalSync>
    <!--
      The path that must be currently mounted for sync to proceed.
      This is required.
    -->
    <mountCheck>/</mountCheck>
  </distro>
 </mirror>
--- a/utils/find_fastest_upstream/archlinux.py
+++ b/utils/find_fastest_upstream/archlinux.py
@@ -0,0 +1,59 @@
 #!/usr/bin/env python3
 import datetime
 import re
 ##
 import iso3166
 ##
 import classes
 _strip_re = re.compile(r'^\s*(?P<num>[0-9.]+).*$')
 class Ranker(classes.Ranker):
    """Ranker for Arch Linux tier 1 mirrors, scraped from the mirror status page.

    On construction the base class gathers the local host's info, then this
    class fetches ``mirrorlist_url`` and resolves the local country's name.
    """
    mirrorlist_url = 'https://www.archlinux.org/mirrors/status/tier/1/'
    # Columns whose cell text is reduced to a float via the module-level _strip_re.
    _float_columns = ('Completion %', 'Mirror Score', 'μ Duration (s)', 'σ Duration (s)')
    # Column whose cell text is converted to a datetime.timedelta.
    _delay_column = 'μ Delay (hh:mm)'

    def __init__(self, *args, **kwargs):
        """Initialise the base Ranker, download the mirror page and set ``mycountry``.

        ``mycountry`` is the iso3166 country name looked up from the alpha-2
        code stored under ``my_info['country']``.
        """
        super().__init__(*args, **kwargs)
        self.get_mirrors()
        self.mycountry = iso3166.countries_by_alpha2[self.my_info['country']].name

    @staticmethod
    def _parse_delay(text):
        """Convert an "hh:mm" string into a ``datetime.timedelta``.

        NOTE(review): assumes the cell text is exactly "<hours>:<minutes>", as
        the column header suggests - confirm against the live page.

        Raises:
            ValueError: if ``text`` is not two integer fields separated by ":".
        """
        hours, minutes = text.strip().split(':')
        return(datetime.timedelta(hours = int(hours), minutes = int(minutes)))

    def extract_mirrors(self):
        """Parse the "successful_mirrors" table and append one dict per mirror to ``raw_mirrors``.

        Each dict maps a column header to the cell for that column. Cells in
        the numeric columns are converted to ``float``, the delay column to a
        ``datetime.timedelta``; every other value is left as the parsed cell
        element.

        Raises:
            RuntimeError: if the page has no table with id "successful_mirrors".
        """
        # Limit to only successful mirrors.
        mirrors = self.bs.find('table', {'id': 'successful_mirrors'})
        if mirrors is None:
            raise RuntimeError('Could not find the successful mirrors table')
        # Modified from https://stackoverflow.com/a/56835562/733214.
        header = mirrors.find('thead').find('tr')
        # A header cell with no text is given the name "details".
        headers = [h.text if h.text != '' else 'details' for h in header.find_all('th')]
        for row in mirrors.find_all('tr'):
            cells = row.find_all('td')
            if not cells:
                # The header row only has <th> cells; don't record an empty mirror.
                continue
            r = dict(zip(headers, cells))
            for k in self._float_columns:
                if k in r:
                    # Keep only the leading number, dropping any trailing text.
                    r[k] = float(_strip_re.sub(r'\g<num>', r[k].text).strip())
            if self._delay_column in r:
                r[self._delay_column] = self._parse_delay(r[self._delay_column].text)
            self.raw_mirrors.append(r)
        return(None)
 def main():
    """Build a Ranker, extract its mirrors and pretty-print the raw results."""
    ranker = Ranker()
    ranker.extract_mirrors()
    # Imported here because it is only needed for this debugging dump.
    import pprint
    pprint.pprint(ranker.raw_mirrors)
    return None
 if __name__ == '__main__':
    main()
--- a/utils/find_fastest_upstream/centos.py
+++ b/utils/find_fastest_upstream/centos.py
--- a/utils/find_fastest_upstream/classes.py
+++ b/utils/find_fastest_upstream/classes.py
@@ -0,0 +1,39 @@
 import socket
 import time
 ##
 import requests
 from bs4 import BeautifulSoup
 ##
 import constants
 class Ranker(object):
    """Base class for distro-specific mirror rankers.

    Construction fetches information about the local host from
    ``constants.MYINFO_URL``. Subclasses set ``mirrorlist_url`` and override
    ``extract_mirrors``.
    """
    mirrorlist_url = None  # This is replaced by subclasses

    def __init__(self, parser = 'lxml', *args, **kwargs):
        """Set up empty state, then gather the local host's info.

        Args:
            parser: the parser name handed to BeautifulSoup in ``get_mirrors``.
        """
        # Initialise every attribute before any network call is made.
        self.my_info = {}
        self.raw_html = None
        self.parser = parser
        self.bs = None
        self.raw_mirrors = []
        self.get_myinfo()

    def extract_mirrors(self):
        """A dummy func. This should be overridden by subclasses."""
        return(None)

    def get_myinfo(self):
        """Fetch ``constants.MYINFO_URL`` and store the decoded JSON in ``my_info``.

        Raises:
            RuntimeError: if the response status is not OK.
        """
        req = requests.get(constants.MYINFO_URL)
        if not req.ok:
            raise RuntimeError('Could not contact information gatherer')
        self.my_info = req.json()
        return(None)

    def get_mirrors(self):
        """Fetch ``mirrorlist_url`` and store its HTML in ``raw_html`` and its parse tree in ``bs``.

        Raises:
            RuntimeError: if the response status is not OK.
        """
        req = requests.get(self.mirrorlist_url)
        if not req.ok:
            # Name the mirror list here so this failure is not mistaken for get_myinfo's.
            raise RuntimeError('Could not fetch mirror list from {0}'.format(self.mirrorlist_url))
        self.raw_html = req.content.decode('utf-8')
        self.bs = BeautifulSoup(self.raw_html, self.parser)
        return(None)
--- a/utils/find_fastest_upstream/constants.py
+++ b/utils/find_fastest_upstream/constants.py
@@ -0,0 +1 @@
 # Endpoint requested by classes.Ranker.get_myinfo(); its response is parsed as JSON.
 MYINFO_URL = 'https://ipinfo.io'
--- a/utils/find_fastest_upstream/test.py
+++ b/utils/find_fastest_upstream/test.py
@@ -0,0 +1,20 @@
 #!/usr/bin/env python3
 import requests
 from bs4 import BeautifulSoup
 # Scratch script: download the Arch tier 1 mirror status page and dump the
 # "successful_mirrors" table as a list of {column header: cell text} dicts.
 country = 'US'
 url = 'https://www.archlinux.org/mirrors/status/tier/1/'
 req = requests.get(url)
 html = req.content.decode('utf-8')
 bs = BeautifulSoup(html, 'lxml')
 mirrors = bs.find('table', {'id': 'successful_mirrors'})
 header = mirrors.find('thead').find('tr')
 # A header cell with no text is given the name "details".
 headers = []
 for th in header.find_all('th'):
    headers.append(th.text or 'details')
 # One dict per <tr>; rows without <td> cells produce an empty dict.
 results = []
 for tr in mirrors.find_all('tr'):
    record = {}
    for idx, td in enumerate(tr.find_all('td')):
        record[headers[idx]] = td.text
    results.append(record)
 import pprint
 pprint.pprint(results)