#!/usr/bin/env python3
"""Fetch a URL and print the text of selected HTML elements.

By default the ``<title>`` element is scraped. ``requests`` and ``lxml`` are
used when they can be imported; otherwise the script falls back to
``urllib.request.urlopen`` and the ``html.parser`` backend respectively.
"""

import argparse
import sys

# Prefer requests for fetching, but fall back to the standard library.
try:
    import requests as handler
    has_req = True
except ImportError:
    from urllib.request import urlopen as handler
    has_req = False
# lxml is imported only to detect whether BeautifulSoup can use it as backend.
try:
    import lxml  # noqa: F401
    parser = 'lxml'
except ImportError:
    parser = 'html.parser'
from bs4 import BeautifulSoup


# Element name scraped when the caller does not specify one.
def_elem = 'title'


class InfoScraper(object):
    """Download a page and look up every occurrence of one element in it.

    The page is fetched and parsed as soon as the object is constructed.

    Attributes set by the constructor / ``_get_page``:
        url:  the URL that was requested.
        elem: the element name passed to ``find_all``.
        raw:  the response body as returned by the fetcher.
        str:  ``raw`` decoded as UTF-8, or ``None`` if it is not valid UTF-8.
        soup: the ``BeautifulSoup`` object built from the page.
    """

    def __init__(self, url, elem=def_elem, *args, **kwargs):
        """Store the target and fetch the page immediately.

        Args:
            url: URL of the page to fetch.
            elem: Name of the element to search for (defaults to ``def_elem``).
            *args: Accepted and ignored.
            **kwargs: Accepted and ignored.
        """
        self.url = url
        self.elem = elem
        self.raw = None
        self.str = None
        self.soup = None
        self._get_page()

    def _get_page(self):
        """Fetch ``self.url`` and populate ``raw``, ``str`` and ``soup``."""
        if has_req:
            self.raw = handler.get(self.url).content
        else:
            with handler(self.url) as fh:
                self.raw = fh.read()
        try:
            self.str = self.raw.decode('utf-8')
        except UnicodeDecodeError:
            # Not UTF-8: leave ``str`` unset and let BeautifulSoup work out the
            # encoding from the raw bytes rather than handing it ``None``.
            self.str = None
        markup = self.raw if self.str is None else self.str
        self.soup = BeautifulSoup(markup, features=parser)
        return None

    def find(self):
        """Return the elements named ``self.elem`` found in the page.

        Returns:
            ``None`` when nothing matches, the single element when exactly one
            matches, otherwise a list of all matching elements.
        """
        matches = list(self.soup.find_all(self.elem))
        if not matches:
            return None
        if len(matches) == 1:
            return matches[0]
        return matches


def parseArgs():
    """Build and return the command-line argument parser (not yet parsed)."""
    args = argparse.ArgumentParser(description='Get quick information from a URL at a glance')
    args.add_argument('-e', '--elem',
                      default=def_elem,
                      help=('The element(s) you want to scrape from the page. This is likely just going to be "{0}" (the default)').format(def_elem))
    args.add_argument('url',
                      metavar='URL',
                      help=('The URL to parse. It may need to be quoted or escaped depending on the URL and what shell you\'re using'))
    return args


def main():
    """Parse the command line, scrape the page and print each match's text.

    Exits with status 1 (after a message on stderr) when the requested
    element does not occur in the page.
    """
    args = parseArgs().parse_args()
    scraper = InfoScraper(**vars(args))
    rslts = scraper.find()
    if rslts is None:
        # find() returns None when there is no match; report that instead of
        # failing on ``None.text``.
        print('No "{0}" element found at {1}'.format(args.elem, args.url), file=sys.stderr)
        sys.exit(1)
    if not isinstance(rslts, list):
        rslts = [rslts]
    for tag in rslts:
        print(tag.text)
    return None


if __name__ == '__main__':
    main()