Source code for pex.crawler

# Copyright 2014 Pants project contributors (see
# Licensed under the Apache License, Version 2.0 (see LICENSE).

"""Support for webpage parsing and crawling."""

import os
import re
import threading
import traceback

from .compatibility import PY3
from .http import Context
from .link import Link
from .tracer import TRACER
from .util import Memoizer

if PY3:
  from queue import Empty, Queue
  from urllib.parse import urlparse
  from Queue import Empty, Queue
  from urlparse import urlparse

[docs]def unescape(s): """Unescapes html. Taken from""" s = s.replace("&lt;", "<") s = s.replace("&gt;", ">") # this has to be last: s = s.replace("&amp;", "&") return s
[docs]class PageParser(object): """A helper class to extract and differentiate ordinary and download links from webpages.""" HREF_RE = re.compile(r"""href=(?:"([^"]*)"|\'([^\']*)\'|([^>\s\n]*))""", re.I | re.S) REL_RE = re.compile(r"""<[^>]*\srel\s*=\s*['"]?([^'">]+)[^>]*>""", re.I) REL_SKIP_EXTENSIONS = frozenset(['.zip', '.tar', '.tar.gz', '.tar.bz2', '.tgz', '.exe']) REL_TYPES = frozenset(['homepage', 'download']) @classmethod def href_match_to_url(cls, match): def pick(group): return '' if group is None else group return unescape(pick( or pick( or pick(
def partition(L, pred): return filter(lambda v: not pred(v), L), filter(lambda v: pred(v), L)
[docs]class Crawler(object): """A multi-threaded crawler that supports local (disk) and remote (web) crawling.""" # Memoizer for calls to Crawler.crawl(). _CRAWL_CACHE = Memoizer()
[docs] @classmethod def reset_cache(cls): """Reset the internal crawl cache. This is intended primarily for tests.""" cls._CRAWL_CACHE = Memoizer()
@classmethod def crawl_local(cls, link): try: dirents = os.listdir(link.local_path) except OSError as e: TRACER.log('Failed to read %s: %s' % (link.local_path, e), V=1) return set(), set() files, dirs = partition([os.path.join(link.local_path, fn) for fn in dirents], os.path.isdir) return set(map(Link.from_filename, files)), set(map(Link.from_filename, dirs)) @classmethod def crawl_remote(cls, context, link): try: link = context.resolve(link) content = context.content(link) except context.Error as e: TRACER.log('Failed to read %s: %s' % (link.url, e), V=1) return set(), set() links = set(link.join(href) for href in PageParser.links(content)) rel_links = set(link.join(href) for href in PageParser.rel_links(content)) return links, rel_links @classmethod def crawl_link(cls, context, link): if link.local: return cls.crawl_local(link) elif link.remote: return cls.crawl_remote(context, link) else: TRACER.log('Failed to crawl %s: unknown scheme %s' % (link.url, link.scheme)) return set(), set() def __init__(self, context=None, threads=1): self._threads = threads self.context = context or Context.get() def _make_cache_key(self, links, follow_links): return (follow_links,) + tuple(links) def crawl(self, link_or_links, follow_links=False): links = list(Link.wrap_iterable(link_or_links)) cache_key = self._make_cache_key(links, follow_links) # Memoize crawling to a global Memoizer (Crawler._CRAWL_CACHE). result = self._CRAWL_CACHE.get(cache_key) if result is None: result = self._crawl(links, follow_links), result) return result def _crawl(self, link_or_links, follow_links): links, seen = set(), set() queue = Queue() converged = threading.Event() def execute(): while not converged.is_set(): try: link = queue.get(timeout=0.01) except Empty: continue if link not in seen: seen.add(link) try: roots, rels = self.crawl_link(self.context, link) except Exception as e: TRACER.log('Unknown exception encountered: %s' % e) for line in traceback.format_exc().splitlines(): TRACER.log(line) queue.task_done() continue links.update(roots) if follow_links: for rel in rels: if rel not in seen: queue.put(rel) queue.task_done() for i, link in enumerate(link_or_links): TRACER.log('crawling link i=%s link=%s follow_links=%s' % (i, link, follow_links), V=3) queue.put(link) workers = [] for _ in range(self._threads): worker = threading.Thread(target=execute) workers.append(worker) worker.daemon = True worker.start() queue.join() converged.set() # We deliberately do not join the worker threads, since they are no longer of any use to us. return links