From b1b1e99abc6ea9712bf9b362692165c332d9aa05 Mon Sep 17 00:00:00 2001 From: warmonger Date: Thu, 12 Feb 2015 17:03:14 +0600 Subject: [PATCH 1/8] autopep8, + .travis --- .travis.yml | 15 ++++++ readability/cleaners.py | 35 +++++++----- readability/debug.py | 18 ++++--- readability/encoding.py | 13 +++-- readability/htmls.py | 23 +++++--- readability/readability.py | 108 ++++++++++++++++++++----------------- requirements.txt | 0 setup.py | 6 +-- tests/test_article_only.py | 5 +- 9 files changed, 137 insertions(+), 86 deletions(-) create mode 100644 .travis.yml create mode 100644 requirements.txt diff --git a/.travis.yml b/.travis.yml new file mode 100644 index 00000000..d8a19975 --- /dev/null +++ b/.travis.yml @@ -0,0 +1,15 @@ +language: python + +python: + - "2.7" + - "3.4" + +install: + - pip install pytest-cov --use-mirrors + - pip install pytest-pep8 --use-mirrors + - pip install coveralls --use-mirrors + +script: py.test ./tests/test* --pep8 readability -v --cov readability --cov-report term-missing + +after_success: + - coveralls \ No newline at end of file diff --git a/readability/cleaners.py b/readability/cleaners.py index 9b158c52..662ceec2 100644 --- a/readability/cleaners.py +++ b/readability/cleaners.py @@ -1,32 +1,39 @@ -# strip out a set of nuisance html attributes that can mess up rendering in RSS feeds +# strip out a set of nuisance html attributes that can mess up rendering +# in RSS feeds import re from lxml.html.clean import Cleaner -bad_attrs = ['width', 'height', 'style', '[-a-z]*color', 'background[-a-z]*', 'on*'] +bad_attrs = ['width', 'height', 'style', + '[-a-z]*color', 'background[-a-z]*', 'on*'] single_quoted = "'[^']+'" double_quoted = '"[^"]+"' non_space = '[^ "\'>]+' -htmlstrip = re.compile("<" # open - "([^>]+) " # prefix - "(?:%s) *" % ('|'.join(bad_attrs),) + # undesirable attributes - '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + # value - "([^>]*)" # postfix - ">" # end -, re.I) +htmlstrip = re.compile("<" # open + "([^>]+) " # prefix + # undesirable attributes + "(?:%s) *" % ('|'.join(bad_attrs),) + + # value + '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + + "([^>]*)" # postfix + ">" # end + , re.I) + def clean_attributes(html): while htmlstrip.search(html): html = htmlstrip.sub('<\\1\\2>', html) return html + def normalize_spaces(s): - if not s: return '' + if not s: + return '' """replace any sequence of whitespace characters with a single space""" return ' '.join(s.split()) html_cleaner = Cleaner(scripts=True, javascript=True, comments=True, - style=True, links=True, meta=False, add_nofollow=False, - page_structure=False, processing_instructions=True, embedded=False, - frames=False, forms=False, annoying_tags=False, remove_tags=None, - remove_unknown_tags=False, safe_attrs_only=False) + style=True, links=True, meta=False, add_nofollow=False, + page_structure=False, processing_instructions=True, embedded=False, + frames=False, forms=False, annoying_tags=False, remove_tags=None, + remove_unknown_tags=False, safe_attrs_only=False) diff --git a/readability/debug.py b/readability/debug.py index a5e644d8..0cf442ce 100644 --- a/readability/debug.py +++ b/readability/debug.py @@ -1,25 +1,29 @@ def save_to_file(text, filename): f = open(filename, 'wt') - f.write('') + f.write( + '') f.write(text.encode('utf-8')) f.close() -uids = {} +uids = {} + + def describe(node, depth=2): if not hasattr(node, 'tag'): return "[%s]" % type(node) name = node.tag - if node.get('id', ''): name += '#'+node.get('id') - if node.get('class', ''): 
- name += '.' + node.get('class').replace(' ','.') + if node.get('id', ''): + name += '#' + node.get('id') + if node.get('class', ''): + name += '.' + node.get('class').replace(' ', '.') if name[:4] in ['div#', 'div.']: name = name[3:] if name in ['tr', 'td', 'div', 'p']: if not node in uids: - uid = uids[node] = len(uids)+1 + uid = uids[node] = len(uids) + 1 else: uid = uids.get(node) name += "%02d" % (uid) if depth and node.getparent() is not None: - return name+' - '+describe(node.getparent(), depth-1) + return name + ' - ' + describe(node.getparent(), depth - 1) return name diff --git a/readability/encoding.py b/readability/encoding.py index fb4761df..366f7b8f 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -1,15 +1,17 @@ import re import chardet + def get_encoding(page): # Regex for XML and HTML Meta charset declaration charset_re = re.compile(r']', flags=re.I) - pragma_re = re.compile(r']', flags=re.I) + pragma_re = re.compile( + r']', flags=re.I) xml_re = re.compile(r'^<\?xml.*?encoding=["\']*(.+?)["\'>]') declared_encodings = (charset_re.findall(page) + - pragma_re.findall(page) + - xml_re.findall(page)) + pragma_re.findall(page) + + xml_re.findall(page)) # Try any declared encodings if len(declared_encodings) > 0: @@ -24,13 +26,14 @@ def get_encoding(page): text = re.sub(']*>\s*', ' ', page) enc = 'utf-8' if not text.strip() or len(text) < 10: - return enc # can't guess + return enc # can't guess res = chardet.detect(text) enc = res['encoding'] or 'utf-8' - #print '->', enc, "%.2f" % res['confidence'] + # print '->', enc, "%.2f" % res['confidence'] enc = custom_decode(enc) return enc + def custom_decode(encoding): """Overrides encoding when charset declaration or charset determination is a subset of a larger diff --git a/readability/htmls.py b/readability/htmls.py index 92598d4a..4240d034 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -3,10 +3,12 @@ from lxml.html import tostring import logging import lxml.html -import re, sys +import re +import sys utf8_parser = lxml.html.HTMLParser(encoding='utf-8') + def build_doc(page): if isinstance(page, unicode): enc = None @@ -14,17 +16,19 @@ def build_doc(page): else: enc = get_encoding(page) or 'utf-8' page_unicode = page.decode(enc, 'replace') - doc = lxml.html.document_fromstring(page_unicode.encode('utf-8', 'replace'), parser=utf8_parser) + doc = lxml.html.document_fromstring( + page_unicode.encode('utf-8', 'replace'), parser=utf8_parser) return doc, enc + def js_re(src, pattern, flags, repl): return re.compile(pattern, flags).sub(src, repl.replace('$', '\\')) def normalize_entities(cur_title): entities = { - u'\u2014':'-', - u'\u2013':'-', + u'\u2014': '-', + u'\u2013': '-', u'—': '-', u'–': '-', u'\u00A0': ' ', @@ -38,9 +42,11 @@ def normalize_entities(cur_title): return cur_title + def norm_title(title): return normalize_entities(normalize_spaces(title)) + def get_title(doc): title = doc.find('.//title') if title is None or len(title.text) == 0: @@ -48,12 +54,14 @@ def get_title(doc): return norm_title(title.text) + def add_match(collection, text, orig): text = norm_title(text) if len(text.split()) >= 2 and len(text) >= 15: if text.replace('"', '') in orig.replace('"', ''): collection.add(text) + def shorten_title(doc): title = doc.find('.//title') if title is None or title.text is None or len(title.text) == 0: @@ -102,13 +110,14 @@ def shorten_title(doc): return title + def get_body(doc): - [ elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style') ] + [elem.drop_tree() for 
elem in doc.xpath('.//script | .//link | .//style')] raw_html = unicode(tostring(doc.body or doc)) cleaned = clean_attributes(raw_html) try: - #BeautifulSoup(cleaned) #FIXME do we really need to try loading it? + # BeautifulSoup(cleaned) #FIXME do we really need to try loading it? return cleaned - except Exception: #FIXME find the equivalent lxml error + except Exception: # FIXME find the equivalent lxml error #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned)) return raw_html diff --git a/readability/readability.py b/readability/readability.py index 9b393d08..45c81fb2 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -33,7 +33,8 @@ #'normalizeRe': re.compile('\s{2,}/'), #'killBreaksRe': re.compile('((\s| ?)*){1,}/'), #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I), - #skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation needed)\s*$/i, + # skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation + # needed)\s*$/i, } @@ -78,6 +79,7 @@ def text_length(i): regexp_type = type(re.compile('hello, world')) + def compile_pattern(elements): if not elements: return None @@ -87,7 +89,9 @@ def compile_pattern(elements): elements = elements.split(',') return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U) + class Document: + """Class to build a etree document out of html.""" TEXT_LENGTH_THRESHOLD = 25 RETRY_LENGTH = 250 @@ -139,7 +143,7 @@ def short_title(self): return shorten_title(self._html(True)) def get_clean_html(self): - return clean_attributes(tounicode(self.html)) + return clean_attributes(tounicode(self.html)) def summary(self, html_partial=False): """Generate the summary of the html docuemnt @@ -165,7 +169,7 @@ def summary(self, html_partial=False): if best_candidate: article = self.get_article(candidates, best_candidate, - html_partial=html_partial) + html_partial=html_partial) else: if ruthless: log.debug("ruthless removal did not work. 
") @@ -219,7 +223,7 @@ def get_article(self, candidates, best_candidate, html_partial=False): append = True sibling_key = sibling # HashableElement(sibling) if sibling_key in candidates and \ - candidates[sibling_key]['content_score'] >= sibling_score_threshold: + candidates[sibling_key]['content_score'] >= sibling_score_threshold: append = True if sibling.tag == "p": @@ -230,8 +234,8 @@ def get_article(self, candidates, best_candidate, html_partial=False): if node_length > 80 and link_density < 0.25: append = True elif node_length <= 80 \ - and link_density == 0 \ - and re.search('\.( |$)', node_content): + and link_density == 0 \ + and re.search('\.( |$)', node_content): append = True if append: @@ -241,12 +245,13 @@ def get_article(self, candidates, best_candidate, html_partial=False): output.append(sibling) else: output.getchildren()[0].getchildren()[0].append(sibling) - #if output is not None: + # if output is not None: # output.append(best_elem) return output def select_best_candidate(self, candidates): - sorted_candidates = sorted(candidates.values(), key=lambda x: x['content_score'], reverse=True) + sorted_candidates = sorted( + candidates.values(), key=lambda x: x['content_score'], reverse=True) for candidate in sorted_candidates[:5]: elem = candidate['elem'] self.debug("Top 5 : %6.3f %s" % ( @@ -263,7 +268,7 @@ def get_link_density(self, elem): link_length = 0 for i in elem.findall(".//a"): link_length += text_length(i) - #if len(elem.findall(".//div") or elem.findall(".//p")): + # if len(elem.findall(".//div") or elem.findall(".//p")): # link_length = link_length total_length = text_length(elem) return float(link_length) / max(total_length, 1) @@ -300,13 +305,14 @@ def score_paragraphs(self, ): content_score = 1 content_score += len(inner_text.split(',')) content_score += min((inner_text_len / 100), 3) - #if elem not in candidates: + # if elem not in candidates: # candidates[elem] = self.score_node(elem) - #WTF? candidates[elem]['content_score'] += content_score + # WTF? candidates[elem]['content_score'] += content_score candidates[parent_node]['content_score'] += content_score if grand_parent_node is not None: - candidates[grand_parent_node]['content_score'] += content_score / 2.0 + candidates[grand_parent_node][ + 'content_score'] += content_score / 2.0 # Scale the final candidates score based on link density. Good content # should have a relatively small link density (5% or less) and be @@ -340,10 +346,10 @@ def class_weight(self, e): if self.negative_keywords and self.negative_keywords.search(feature): weight -= 25 - if self.positive_keywords and self.positive_keywords.match('tag-'+e.tag): + if self.positive_keywords and self.positive_keywords.match('tag-' + e.tag): weight += 25 - if self.negative_keywords and self.negative_keywords.match('tag-'+e.tag): + if self.negative_keywords and self.negative_keywords.match('tag-' + e.tag): weight -= 25 return weight @@ -373,7 +379,7 @@ def remove_unlikely_candidates(self): s = "%s %s" % (elem.get('class', ''), elem.get('id', '')) if len(s) < 2: continue - #self.debug(s) + # self.debug(s) if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag not in ['html', 'body']: self.debug("Removing unlikely candidate - %s" % describe(elem)) elem.drop_tree() @@ -382,7 +388,7 @@ def transform_misused_divs_into_paragraphs(self): for elem in self.tags(self.html, 'div'): # transform
<div>s that do not contain other block elements into
             # <p>
s - #FIXME: The current implementation ignores all descendants that + # FIXME: The current implementation ignores all descendants that # are not direct children of elem # This results in incorrect results in case there is an # buried within an for example @@ -390,7 +396,7 @@ def transform_misused_divs_into_paragraphs(self): unicode(''.join(map(tostring, list(elem))))): #self.debug("Altering %s to p" % (describe(elem))) elem.tag = "p" - #print "Fixed element "+describe(elem) + # print "Fixed element "+describe(elem) for elem in self.tags(self.html, 'div'): if elem.text and elem.text.strip(): @@ -398,7 +404,7 @@ def transform_misused_divs_into_paragraphs(self): p.text = elem.text elem.text = None elem.insert(0, p) - #print "Appended "+tounicode(p)+" to "+describe(elem) + # print "Appended "+tounicode(p)+" to "+describe(elem) for pos, child in reversed(list(enumerate(elem))): if child.tail and child.tail.strip(): @@ -406,9 +412,9 @@ def transform_misused_divs_into_paragraphs(self): p.text = child.tail child.tail = None elem.insert(pos + 1, p) - #print "Inserted "+tounicode(p)+" to "+describe(elem) + # print "Inserted "+tounicode(p)+" to "+describe(elem) if child.tag == 'br': - #print 'Dropped
<br> at '+describe(elem)
+                # print 'Dropped <br>
at '+describe(elem) child.drop_tree() def tags(self, node, *tag_names): @@ -423,7 +429,7 @@ def reverse_tags(self, node, *tag_names): def sanitize(self, node, candidates): MIN_LEN = self.options.get('min_text_length', - self.TEXT_LENGTH_THRESHOLD) + self.TEXT_LENGTH_THRESHOLD) for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"): if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: header.drop_tree() @@ -438,14 +444,14 @@ def sanitize(self, node, candidates): weight = self.class_weight(el) if el in candidates: content_score = candidates[el]['content_score'] - #print '!',el, '-> %6.3f' % content_score + # print '!',el, '-> %6.3f' % content_score else: content_score = 0 tag = el.tag if weight + content_score < 0: self.debug("Cleaned %s with score %6.3f and weight %-3s" % - (describe(el), content_score, weight, )) + (describe(el), content_score, weight, )) el.drop_tree() elif el.text_content().count(",") < 10: counts = {} @@ -460,19 +466,20 @@ def sanitize(self, node, candidates): parent_node = el.getparent() if parent_node is not None: if parent_node in candidates: - content_score = candidates[parent_node]['content_score'] + content_score = candidates[ + parent_node]['content_score'] else: content_score = 0 - #if parent_node is not None: + # if parent_node is not None: #pweight = self.class_weight(parent_node) + content_score #pname = describe(parent_node) - #else: + # else: #pweight = 0 #pname = "no parent" to_remove = False reason = "" - #if el.tag == 'div' and counts["img"] >= 1: + # if el.tag == 'div' and counts["img"] >= 1: # continue if counts["p"] and counts["img"] > counts["p"]: reason = "too many images (%s)" % counts["img"] @@ -487,9 +494,9 @@ def sanitize(self, node, candidates): reason = "too short content length %s without a single image" % content_length to_remove = True elif weight < 25 and link_density > 0.2: - reason = "too many links %.3f for its weight %s" % ( - link_density, weight) - to_remove = True + reason = "too many links %.3f for its weight %s" % ( + link_density, weight) + to_remove = True elif weight >= 25 and link_density > 0.5: reason = "too many links %.3f for its weight %s" % ( link_density, weight) @@ -516,27 +523,27 @@ def sanitize(self, node, candidates): # for desnode in self.tags(el, "table", "ul", "div"): # allowed[desnode] = True - #find x non empty preceding and succeeding siblings + # find x non empty preceding and succeeding siblings i, j = 0, 0 x = 1 siblings = [] for sib in el.itersiblings(): - #self.debug(sib.text_content()) + # self.debug(sib.text_content()) sib_content_length = text_length(sib) if sib_content_length: - i =+ 1 + i = + 1 siblings.append(sib_content_length) if i == x: break for sib in el.itersiblings(preceding=True): - #self.debug(sib.text_content()) + # self.debug(sib.text_content()) sib_content_length = text_length(sib) if sib_content_length: - j =+ 1 + j = + 1 siblings.append(sib_content_length) if j == x: break - #self.debug(str(siblings)) + # self.debug(str(siblings)) if siblings and sum(siblings) > 1000: to_remove = False self.debug("Allowing %s" % describe(el)) @@ -545,14 +552,14 @@ def sanitize(self, node, candidates): if to_remove: self.debug("Cleaned %6.3f %s with weight %s cause it has %s." 
% - (content_score, describe(el), weight, reason)) - #print tounicode(el) + (content_score, describe(el), weight, reason)) + # print tounicode(el) #self.debug("pname %s pweight %.3f" %(pname, pweight)) el.drop_tree() for el in ([node] + [n for n in node.iter()]): if not self.options.get('attributes', None): - #el.attrib = {} #FIXME:Checkout the effects of disabling this + # el.attrib = {} #FIXME:Checkout the effects of disabling this pass self.html = node @@ -560,6 +567,7 @@ def sanitize(self, node, candidates): class HashableElement(): + def __init__(self, node): self.node = node self._path = None @@ -590,9 +598,12 @@ def main(): from optparse import OptionParser parser = OptionParser(usage="%prog: [options] [file]") parser.add_option('-v', '--verbose', action='store_true') - parser.add_option('-u', '--url', default=None, help="use URL instead of a local file") - parser.add_option('-p', '--positive-keywords', default=None, help="positive keywords (separated with comma)", action='store') - parser.add_option('-n', '--negative-keywords', default=None, help="negative keywords (separated with comma)", action='store') + parser.add_option( + '-u', '--url', default=None, help="use URL instead of a local file") + parser.add_option('-p', '--positive-keywords', default=None, + help="positive keywords (separated with comma)", action='store') + parser.add_option('-n', '--negative-keywords', default=None, + help="negative keywords (separated with comma)", action='store') (options, args) = parser.parse_args() if not (len(args) == 1 or options.url): @@ -605,14 +616,15 @@ def main(): file = urllib.urlopen(options.url) else: file = open(args[0], 'rt') - enc = sys.__stdout__.encoding or 'utf-8' # XXX: this hack could not always work, better to set PYTHONIOENCODING + # XXX: this hack could not always work, better to set PYTHONIOENCODING + enc = sys.__stdout__.encoding or 'utf-8' try: print Document(file.read(), - debug=options.verbose, - url=options.url, - positive_keywords = options.positive_keywords, - negative_keywords = options.negative_keywords, - ).summary().encode(enc, 'replace') + debug=options.verbose, + url=options.url, + positive_keywords=options.positive_keywords, + negative_keywords=options.negative_keywords, + ).summary().encode(enc, 'replace') finally: file.close() diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 00000000..e69de29b diff --git a/setup.py b/setup.py index 29d599db..ea6eeb30 100755 --- a/setup.py +++ b/setup.py @@ -17,7 +17,7 @@ author="Yuri Baburov", author_email="burchik@gmail.com", description="fast python port of arc90's readability tool", - test_suite = "tests.test_article_only", + test_suite="tests.test_article_only", long_description=open("README").read(), license="Apache License 2.0", url="https://github.com/buriy/python-readability", @@ -25,11 +25,11 @@ install_requires=[ "chardet", lxml_requirement - ], + ], classifiers=[ "Environment :: Web Environment", "Intended Audience :: Developers", "Operating System :: OS Independent", "Programming Language :: Python", - ], + ], ) diff --git a/tests/test_article_only.py b/tests/test_article_only.py index 3a8f1c6e..9e3a9c82 100644 --- a/tests/test_article_only.py +++ b/tests/test_article_only.py @@ -13,6 +13,7 @@ def load_sample(filename): class TestArticleOnly(unittest.TestCase): + """The option to not get back a full html doc should work Given a full html document, the call can request just divs of processed @@ -33,7 +34,7 @@ def test_si_sample(self): def test_si_sample_html_partial(self): 
"""Using the si sample, make sure we can get the article alone.""" sample = load_sample('si-game.sample.html') - doc = Document(sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html') + doc = Document( + sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html') res = doc.summary(html_partial=True) self.assertEqual('

]*)" # postfix - ">" # end + ">" # end , re.I) @@ -32,6 +34,7 @@ def normalize_spaces(s): characters with a single space""" return ' '.join(s.split()) + html_cleaner = Cleaner(scripts=True, javascript=True, comments=True, style=True, links=True, meta=False, add_nofollow=False, page_structure=False, processing_instructions=True, embedded=False, diff --git a/readability/debug.py b/readability/debug.py index 0cf442ce..fdb70a53 100644 --- a/readability/debug.py +++ b/readability/debug.py @@ -5,6 +5,7 @@ def save_to_file(text, filename): f.write(text.encode('utf-8')) f.close() + uids = {} diff --git a/readability/encoding.py b/readability/encoding.py index 366f7b8f..28de2cba 100644 --- a/readability/encoding.py +++ b/readability/encoding.py @@ -1,4 +1,5 @@ import re + import chardet diff --git a/readability/htmls.py b/readability/htmls.py index 4240d034..02826046 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -1,10 +1,10 @@ +import re + from cleaners import normalize_spaces, clean_attributes from encoding import get_encoding from lxml.html import tostring -import logging import lxml.html -import re -import sys + utf8_parser = lxml.html.HTMLParser(encoding='utf-8') @@ -78,7 +78,8 @@ def shorten_title(doc): if e.text_content(): add_match(candidates, e.text_content(), orig) - for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', '.contentheading', '.small_header_red']: + for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', + '.contentheading', '.small_header_red']: for e in doc.cssselect(item): if e.text: add_match(candidates, e.text, orig) @@ -119,5 +120,5 @@ def get_body(doc): # BeautifulSoup(cleaned) #FIXME do we really need to try loading it? 
return cleaned except Exception: # FIXME find the equivalent lxml error - #logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned)) + # logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned)) return raw_html diff --git a/readability/readability.py b/readability/readability.py index 45c81fb2..6ba80d59 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -3,12 +3,10 @@ import re import sys -from collections import defaultdict from lxml.etree import tostring from lxml.etree import tounicode from lxml.html import document_fromstring from lxml.html import fragment_fromstring - from cleaners import clean_attributes from cleaners import html_cleaner from htmls import build_doc @@ -20,14 +18,17 @@ logging.basicConfig(level=logging.INFO) log = logging.getLogger() - REGEXES = { - 'unlikelyCandidatesRe': re.compile('combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I), + 'unlikelyCandidatesRe': re.compile( + 'combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', + re.I), 'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I), 'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I), - 'negativeRe': re.compile('combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I), + 'negativeRe': re.compile( + 'combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', + re.I), 'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I), - #'replaceBrsRe': re.compile('(]*>[ \n\r\t]*){2,}',re.I), + # 'replaceBrsRe': re.compile('(]*>[ \n\r\t]*){2,}',re.I), #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I), #'trimRe': re.compile('^\s+|\s+$/'), #'normalizeRe': re.compile('\s{2,}/'), @@ -77,6 +78,7 @@ def clean(text): def text_length(i): return len(clean(i.text_content() or "")) + regexp_type = type(re.compile('hello, world')) @@ -91,7 +93,6 @@ def compile_pattern(elements): class Document: - """Class to build a etree document out of html.""" TEXT_LENGTH_THRESHOLD = 25 RETRY_LENGTH = 250 @@ -223,7 +224,7 @@ def get_article(self, candidates, best_candidate, html_partial=False): append = True sibling_key = sibling # HashableElement(sibling) if sibling_key in candidates and \ - candidates[sibling_key]['content_score'] >= sibling_score_threshold: + candidates[sibling_key]['content_score'] >= sibling_score_threshold: append = True if sibling.tag == "p": @@ -246,7 +247,7 @@ def get_article(self, candidates, best_candidate, html_partial=False): else: output.getchildren()[0].getchildren()[0].append(sibling) # if output is not None: - # output.append(best_elem) + # output.append(best_elem) return output def select_best_candidate(self, candidates): @@ -269,7 +270,7 @@ def get_link_density(self, elem): for i in elem.findall(".//a"): link_length += text_length(i) # if len(elem.findall(".//div") or elem.findall(".//p")): - # link_length = link_length + # link_length = link_length total_length = text_length(elem) return float(link_length) / max(total_length, 1) @@ -306,7 +307,7 @@ def score_paragraphs(self, ): content_score += len(inner_text.split(',')) content_score 
+= min((inner_text_len / 100), 3) # if elem not in candidates: - # candidates[elem] = self.score_node(elem) + # candidates[elem] = self.score_node(elem) # WTF? candidates[elem]['content_score'] += content_score candidates[parent_node]['content_score'] += content_score @@ -380,7 +381,8 @@ def remove_unlikely_candidates(self): if len(s) < 2: continue # self.debug(s) - if REGEXES['unlikelyCandidatesRe'].search(s) and (not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag not in ['html', 'body']: + if REGEXES['unlikelyCandidatesRe'].search(s) and ( + not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag not in ['html', 'body']: self.debug("Removing unlikely candidate - %s" % describe(elem)) elem.drop_tree() @@ -394,7 +396,7 @@ def transform_misused_divs_into_paragraphs(self): # buried within an for example if not REGEXES['divToPElementsRe'].search( unicode(''.join(map(tostring, list(elem))))): - #self.debug("Altering %s to p" % (describe(elem))) + # self.debug("Altering %s to p" % (describe(elem))) elem.tag = "p" # print "Fixed element "+describe(elem) @@ -470,17 +472,17 @@ def sanitize(self, node, candidates): parent_node]['content_score'] else: content_score = 0 - # if parent_node is not None: - #pweight = self.class_weight(parent_node) + content_score - #pname = describe(parent_node) - # else: - #pweight = 0 - #pname = "no parent" + # if parent_node is not None: + # pweight = self.class_weight(parent_node) + content_score + #pname = describe(parent_node) + # else: + #pweight = 0 + #pname = "no parent" to_remove = False reason = "" # if el.tag == 'div' and counts["img"] >= 1: - # continue + # continue if counts["p"] and counts["img"] > counts["p"]: reason = "too many images (%s)" % counts["img"] to_remove = True @@ -504,24 +506,24 @@ def sanitize(self, node, candidates): elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1: reason = "s with too short content length, or too many s" to_remove = True -# if el.tag == 'div' and counts['img'] >= 1 and to_remove: -# imgs = el.findall('.//img') -# valid_img = False -# self.debug(tounicode(el)) -# for img in imgs: -# -# height = img.get('height') -# text_length = img.get('text_length') -# self.debug ("height %s text_length %s" %(repr(height), repr(text_length))) -# if to_int(height) >= 100 or to_int(text_length) >= 100: -# valid_img = True -# self.debug("valid image" + tounicode(img)) -# break -# if valid_img: -# to_remove = False -# self.debug("Allowing %s" %el.text_content()) -# for desnode in self.tags(el, "table", "ul", "div"): -# allowed[desnode] = True + # if el.tag == 'div' and counts['img'] >= 1 and to_remove: + # imgs = el.findall('.//img') + # valid_img = False + # self.debug(tounicode(el)) + # for img in imgs: + # + # height = img.get('height') + # text_length = img.get('text_length') + # self.debug ("height %s text_length %s" %(repr(height), repr(text_length))) + # if to_int(height) >= 100 or to_int(text_length) >= 100: + # valid_img = True + # self.debug("valid image" + tounicode(img)) + # break + # if valid_img: + # to_remove = False + # self.debug("Allowing %s" %el.text_content()) + # for desnode in self.tags(el, "table", "ul", "div"): + # allowed[desnode] = True # find x non empty preceding and succeeding siblings i, j = 0, 0 @@ -567,7 +569,6 @@ def sanitize(self, node, candidates): class HashableElement(): - def __init__(self, node): self.node = node self._path = None @@ -582,6 +583,7 @@ def _get_path(self): node = node.getparent() self._path = tuple(reverse_path) return self._path + path = 
property(_get_path) def __hash__(self): @@ -596,6 +598,7 @@ def __getattr__(self, tag): def main(): from optparse import OptionParser + parser = OptionParser(usage="%prog: [options] [file]") parser.add_option('-v', '--verbose', action='store_true') parser.add_option( @@ -613,6 +616,7 @@ def main(): file = None if options.url: import urllib + file = urllib.urlopen(options.url) else: file = open(args[0], 'rt') @@ -624,9 +628,10 @@ def main(): url=options.url, positive_keywords=options.positive_keywords, negative_keywords=options.negative_keywords, - ).summary().encode(enc, 'replace') + ).summary().encode(enc, 'replace') finally: file.close() + if __name__ == '__main__': main() diff --git a/requirements.txt b/requirements.txt index e69de29b..86c871ed 100644 --- a/requirements.txt +++ b/requirements.txt @@ -0,0 +1 @@ +lxml \ No newline at end of file diff --git a/setup.py b/setup.py index ea6eeb30..9f4bb71d 100755 --- a/setup.py +++ b/setup.py @@ -1,10 +1,13 @@ #!/usr/bin/env python -from setuptools import setup, find_packages import sys +from setuptools import setup + + lxml_requirement = "lxml" if sys.platform == 'darwin': import platform + mac_ver = platform.mac_ver()[0] mac_ver_no = int(mac_ver.split('.')[1]) if mac_ver_no < 9: diff --git a/tests/samples/si-game.sample.html b/tests/samples/si-game.sample.html index fab4f4fe..1a530758 100644 --- a/tests/samples/si-game.sample.html +++ b/tests/samples/si-game.sample.html @@ -1,762 +1,1265 @@ - - - - - Detroit Tigers vs. Kansas City Royals - Preview - April 16, 2012 - - - - - - - - - - - - - + + + + + Detroit Tigers vs. Kansas City Royals - Preview - April 16, 2012 + + + + + + + + + + + + - - - - - - - - - - - - - - - -
[hunk body omitted: whitespace/indentation reflow of the tests/samples/si-game.sample.html fixture; the sample's markup was consumed by HTML rendering in this copy and is not recoverable]
+
- + - + - + + + - - - - - \ No newline at end of file + + + \ No newline at end of file diff --git a/tests/test_article_only.py b/tests/test_article_only.py index 9e3a9c82..21b1fc03 100644 --- a/tests/test_article_only.py +++ b/tests/test_article_only.py @@ -13,7 +13,6 @@ def load_sample(filename): class TestArticleOnly(unittest.TestCase): - """The option to not get back a full html doc should work Given a full html document, the call can request just divs of processed From 3002ff5dc8f133b99e7fcc5715b5a104c2486e61 Mon Sep 17 00:00:00 2001 From: warmonger Date: Thu, 12 Feb 2015 17:08:28 +0600 Subject: [PATCH 3/8] refactoring --- readability/cleaners.py | 9 +- readability/htmls.py | 8 +- readability/readability.py | 66 +++-- tests/samples/si-game.sample.html | 390 ++++++++++++++++++++---------- tests/test_article_only.py | 3 +- 5 files changed, 316 insertions(+), 160 deletions(-) diff --git a/readability/cleaners.py b/readability/cleaners.py index c43396b4..8dc69fc7 100644 --- a/readability/cleaners.py +++ b/readability/cleaners.py @@ -15,7 +15,8 @@ # undesirable attributes "(?:%s) *" % ('|'.join(bad_attrs),) + # value - '= *(?:%s|%s|%s)' % (non_space, single_quoted, double_quoted) + + '= *(?:%s|%s|%s)' % ( + non_space, single_quoted, double_quoted) + "([^>]*)" # postfix ">" # end , re.I) @@ -37,6 +38,8 @@ def normalize_spaces(s): html_cleaner = Cleaner(scripts=True, javascript=True, comments=True, style=True, links=True, meta=False, add_nofollow=False, - page_structure=False, processing_instructions=True, embedded=False, - frames=False, forms=False, annoying_tags=False, remove_tags=None, + page_structure=False, processing_instructions=True, + embedded=False, + frames=False, forms=False, annoying_tags=False, + remove_tags=None, remove_unknown_tags=False, safe_attrs_only=False) diff --git a/readability/htmls.py b/readability/htmls.py index 02826046..0d1e9079 100644 --- a/readability/htmls.py +++ b/readability/htmls.py @@ -1,10 +1,11 @@ import re -from cleaners import normalize_spaces, clean_attributes -from encoding import get_encoding from lxml.html import tostring import lxml.html +from cleaners import normalize_spaces, clean_attributes +from encoding import get_encoding + utf8_parser = lxml.html.HTMLParser(encoding='utf-8') @@ -78,7 +79,8 @@ def shorten_title(doc): if e.text_content(): add_match(candidates, e.text_content(), orig) - for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', '.title', '.head', '.heading', + for item in ['#title', '#head', '#heading', '.pageTitle', '.news_title', + '.title', '.head', '.heading', '.contentheading', '.small_header_red']: for e in doc.cssselect(item): if e.text: diff --git a/readability/readability.py b/readability/readability.py index 6ba80d59..e5252268 100755 --- a/readability/readability.py +++ b/readability/readability.py @@ -7,6 +7,7 @@ from lxml.etree import tounicode from lxml.html import document_fromstring from lxml.html import fragment_fromstring + from cleaners import clean_attributes from cleaners import html_cleaner from htmls import build_doc @@ -22,15 +23,19 @@ 'unlikelyCandidatesRe': re.compile( 'combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter', re.I), - 'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I), - 'positiveRe': re.compile('article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', re.I), + 'okMaybeItsACandidateRe': 
re.compile('and|article|body|column|main|shadow', + re.I), + 'positiveRe': re.compile( + 'article|body|content|entry|hentry|main|page|pagination|post|text|blog|story', + re.I), 'negativeRe': re.compile( 'combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget', re.I), - 'divToPElementsRe': re.compile('<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I), + 'divToPElementsRe': re.compile( + '<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I), # 'replaceBrsRe': re.compile('(]*>[ \n\r\t]*){2,}',re.I), - #'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I), - #'trimRe': re.compile('^\s+|\s+$/'), + # 'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I), + # 'trimRe': re.compile('^\s+|\s+$/'), #'normalizeRe': re.compile('\s{2,}/'), #'killBreaksRe': re.compile('((\s| ?)*){1,}/'), #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I), @@ -97,7 +102,8 @@ class Document: TEXT_LENGTH_THRESHOLD = 25 RETRY_LENGTH = 250 - def __init__(self, input, positive_keywords=None, negative_keywords=None, **options): + def __init__(self, input, positive_keywords=None, negative_keywords=None, + **options): """Generate the document :param input: string of the html content. @@ -224,7 +230,8 @@ def get_article(self, candidates, best_candidate, html_partial=False): append = True sibling_key = sibling # HashableElement(sibling) if sibling_key in candidates and \ - candidates[sibling_key]['content_score'] >= sibling_score_threshold: + candidates[sibling_key][ + 'content_score'] >= sibling_score_threshold: append = True if sibling.tag == "p": @@ -341,16 +348,20 @@ def class_weight(self, e): if REGEXES['positiveRe'].search(feature): weight += 25 - if self.positive_keywords and self.positive_keywords.search(feature): + if self.positive_keywords and self.positive_keywords.search( + feature): weight += 25 - if self.negative_keywords and self.negative_keywords.search(feature): + if self.negative_keywords and self.negative_keywords.search( + feature): weight -= 25 - if self.positive_keywords and self.positive_keywords.match('tag-' + e.tag): + if self.positive_keywords and self.positive_keywords.match( + 'tag-' + e.tag): weight += 25 - if self.negative_keywords and self.negative_keywords.match('tag-' + e.tag): + if self.negative_keywords and self.negative_keywords.match( + 'tag-' + e.tag): weight -= 25 return weight @@ -382,7 +393,8 @@ def remove_unlikely_candidates(self): continue # self.debug(s) if REGEXES['unlikelyCandidatesRe'].search(s) and ( - not REGEXES['okMaybeItsACandidateRe'].search(s)) and elem.tag not in ['html', 'body']: + not REGEXES['okMaybeItsACandidateRe'].search( + s)) and elem.tag not in ['html', 'body']: self.debug("Removing unlikely candidate - %s" % describe(elem)) elem.drop_tree() @@ -433,7 +445,8 @@ def sanitize(self, node, candidates): MIN_LEN = self.options.get('min_text_length', self.TEXT_LENGTH_THRESHOLD) for header in self.tags(node, "h1", "h2", "h3", "h4", "h5", "h6"): - if self.class_weight(header) < 0 or self.get_link_density(header) > 0.33: + if self.class_weight(header) < 0 or self.get_link_density( + header) > 0.33: header.drop_tree() for elem in self.tags(node, "form", "iframe", "textarea"): @@ -474,9 +487,9 @@ def sanitize(self, node, candidates): content_score = 0 # if parent_node is not None: # pweight = self.class_weight(parent_node) + content_score - #pname = describe(parent_node) + # pname = describe(parent_node) # else: - #pweight = 0 + # pweight = 0 #pname = "no parent" 
to_remove = False reason = "" @@ -492,7 +505,8 @@ def sanitize(self, node, candidates): elif counts["input"] > (counts["p"] / 3): reason = "less than 3x

s than s" to_remove = True - elif content_length < (MIN_LEN) and (counts["img"] == 0 or counts["img"] > 2): + elif content_length < (MIN_LEN) and ( + counts["img"] == 0 or counts["img"] > 2): reason = "too short content length %s without a single image" % content_length to_remove = True elif weight < 25 and link_density > 0.2: @@ -503,11 +517,12 @@ def sanitize(self, node, candidates): reason = "too many links %.3f for its weight %s" % ( link_density, weight) to_remove = True - elif (counts["embed"] == 1 and content_length < 75) or counts["embed"] > 1: + elif (counts["embed"] == 1 and content_length < 75) or counts[ + "embed"] > 1: reason = "s with too short content length, or too many s" to_remove = True - # if el.tag == 'div' and counts['img'] >= 1 and to_remove: - # imgs = el.findall('.//img') + # if el.tag == 'div' and counts['img'] >= 1 and to_remove: + # imgs = el.findall('.//img') # valid_img = False # self.debug(tounicode(el)) # for img in imgs: @@ -553,10 +568,11 @@ def sanitize(self, node, candidates): allowed[desnode] = True if to_remove: - self.debug("Cleaned %6.3f %s with weight %s cause it has %s." % - (content_score, describe(el), weight, reason)) + self.debug( + "Cleaned %6.3f %s with weight %s cause it has %s." % + (content_score, describe(el), weight, reason)) # print tounicode(el) - #self.debug("pname %s pweight %.3f" %(pname, pweight)) + # self.debug("pname %s pweight %.3f" %(pname, pweight)) el.drop_tree() for el in ([node] + [n for n in node.iter()]): @@ -604,9 +620,11 @@ def main(): parser.add_option( '-u', '--url', default=None, help="use URL instead of a local file") parser.add_option('-p', '--positive-keywords', default=None, - help="positive keywords (separated with comma)", action='store') + help="positive keywords (separated with comma)", + action='store') parser.add_option('-n', '--negative-keywords', default=None, - help="negative keywords (separated with comma)", action='store') + help="negative keywords (separated with comma)", + action='store') (options, args) = parser.parse_args() if not (len(args) == 1 or options.url): diff --git a/tests/samples/si-game.sample.html b/tests/samples/si-game.sample.html index 1a530758..1906087a 100644 --- a/tests/samples/si-game.sample.html +++ b/tests/samples/si-game.sample.html @@ -1,11 +1,15 @@ - + - Detroit Tigers vs. Kansas City Royals - Preview - April 16, 2012 - - + Detroit Tigers vs. Kansas City Royals - Preview - April 16, + 2012 + + - + - + - + @@ -378,16 +388,20 @@

Get the Wildcats Championship Package
+ title="Get the Wildcats Championship Package"/> +
Get the Wildcats Championship Package
+ title="Get the Wildcats Championship Package"/> +
Get MLB 2K 12 FREE
+ alt="Get MLB 2K 12 FREE" + title="Get MLB 2K 12 FREE"/>
Get MLB 2K 12 FREE
+ alt="Get MLB 2K 12 FREE" + title="Get MLB 2K 12 FREE"/>
-
SI.com Home +
SI.com Home
- -
Scores | Teams | Players | Player News | Standings + src="http://i.cdn.turner.com/si/.element/img/4.0/sect/baseball/mlb/icon.jpg"/> +
+ +
Scores | Teams | Players | Player News | Standings | Probables | Schedules | Stats + href="/baseball/mlb/schedules/weekly/today/">Schedules + | Stats | Transactions | Injuries | Tickets | - Tickets | + MLB.TV
@@ -514,17 +545,22 @@ @@ -694,7 +730,8 @@ NFL
  • - COLLEGE FOOTBALL + COLLEGE + FOOTBALL
  • MLB @@ -703,7 +740,8 @@ NBA
  • - COLLEGE BB + COLLEGE + BB
  • GOLF @@ -778,7 +816,9 @@ E - TIGERS + TIGERS +       @@ -793,7 +833,9 @@   - ROYALS + ROYALS +       @@ -816,9 +858,11 @@
  • @@ -865,96 +913,131 @@

    Tigers-Royals Preview

    - Justin + Justin Verlander - has pitched well in each of his first two starts, though he doesn't have a win to show for those + has pitched well in each of his first two starts, though he + doesn't have a win to show for those efforts.

    He hasn't had much trouble earning victories against the - Kansas City + Kansas + City Royals .

    - Verlander looks to continue his mastery of the Royals when the - Detroit + Verlander looks to continue his mastery of the Royals when + the + Detroit Tigers - visit Kauffman Stadium in the opener of a three-game series Monday night. + visit Kauffman Stadium in the opener of a three-game series + Monday night.

    The reigning AL - Cy Young - winner and MVP had a 2-0 lead through eight innings in both of his outings, but the Tigers weren't + Cy + Young + winner and MVP had a 2-0 lead through eight innings in both + of his outings, but the Tigers weren't able to hold the lead.

    -

    Verlander (0-1, 2.20 ERA) allowed two hits before running into trouble in the ninth against Tampa Bay +

    Verlander (0-1, 2.20 ERA) allowed two hits before running + into trouble in the ninth against Tampa Bay on Wednesday, getting charged with four runs in 8 1-3 innings of a 4-2 defeat. -

    "Once a couple guys got on, really the first time I've cranked it up like that - and lost a little +

    "Once a couple guys got on, really the first time I've + cranked it up like that - and lost a little bit of my consistency that - I'd had all day," Verlander said. "It's inexcusable. This loss rests solely on my shoulders." -

    The right-hander did his part in his opening-day start against Boston on April 5, allowing two hits + I'd had all day," Verlander said. "It's inexcusable. This loss + rests solely on my shoulders." +

[Remaining hunks of the tests sample fixture si-game.sample.html are not reproduced here: they only re-wrap the long text lines of the "Tigers-Royals Preview" article (hunks @@ -962,18 +1045,25 @@ through @@ -1223,8 +1347,10 @@, ending with "\ No newline at end of file"), and the surrounding markup, including a Quantcast tracking pixel near the end of the file, was lost in extraction.]

diff --git a/tests/test_article_only.py b/tests/test_article_only.py
index 21b1fc03..6b9fce07 100644
--- a/tests/test_article_only.py
+++ b/tests/test_article_only.py
@@ -34,6 +34,7 @@ def test_si_sample_html_partial(self):
         """Using the si sample, make sure we can get the article alone."""
         sample = load_sample('si-game.sample.html')
         doc = Document(
-            sample, url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
+            sample,
+            url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
         res = doc.summary(html_partial=True)
         self.assertEqual('<div><div class="', res[0:17])
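For context, the test touched by this hunk drives the library end to end against that fixture. A minimal sketch of the same flow, not the verbatim test (the samples path is an assumption based on the suite's load_sample helper):

    import os

    from readability.readability import Document

    def load_sample(filename):
        # mirrors the suite's helper; assumes fixtures live in tests/samples/
        path = os.path.join(os.path.dirname(__file__), 'samples', filename)
        with open(path) as f:
            return f.read()

    doc = Document(
        load_sample('si-game.sample.html'),
        url='http://sportsillustrated.cnn.com/baseball/mlb/gameflash/2012/04/16/40630_preview.html')
    res = doc.summary(html_partial=True)  # html_partial=True returns just the article div,
                                          # not a full html document

Supplying url matters because, per the Document docstring, it lets the extractor rewrite the fixture's relative links to absolute ones.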
From: warmonger
Date: Thu, 12 Feb 2015 17:16:53 +0600
Subject: [PATCH 5/8] refactoring

---
 readability/cleaners.py    |  6 ++--
 readability/debug.py       |  5 ++--
 readability/htmls.py       |  1 -
 readability/readability.py | 57 +++++++++++++++++++-------------------
 4 files changed, 35 insertions(+), 34 deletions(-)

diff --git a/readability/cleaners.py b/readability/cleaners.py
index 6cfb862d..ae731019 100644
--- a/readability/cleaners.py
+++ b/readability/cleaners.py
@@ -19,10 +19,10 @@
                        "(?:%s) *" % ('|'.join(bad_attrs),) +
                        # value
                        '= *(?:%s|%s|%s)' % (
-                           non_space, single_quoted, double_quoted) +
+                       non_space, single_quoted, double_quoted) +
                        "([^>]*)"  # postfix
-                       ">"  # end
-                       , re.I)
+                       ">",  # end
+                       re.I)
 
 
 def clean_attributes(html):

diff --git a/readability/debug.py b/readability/debug.py
index 6f02c0c3..c472ed75 100644
--- a/readability/debug.py
+++ b/readability/debug.py
@@ -4,7 +4,8 @@ def save_to_file(text, filename):
     f = open(filename, 'wt')
     f.write(
-        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />')
+        '<meta http-equiv="Content-Type" content="text/html; charset=UTF-8" />'
+    )
     f.write(text.encode('utf-8'))
     f.close()
@@ -23,7 +24,7 @@ def describe(node, depth=2):
     if name[:4] in ['div#', 'div.']:
         name = name[3:]
     if name in ['tr', 'td', 'div', 'p']:
-        if not node in uids:
+        if node not in uids:
             uid = uids[node] = len(uids) + 1
         else:
             uid = uids.get(node)

diff --git a/readability/htmls.py b/readability/htmls.py
index 90ca4571..c1bf23c5 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -124,5 +124,4 @@ def get_body(doc):
         # BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned
     except Exception:  # FIXME find the equivalent lxml error
-        # logging.error("cleansing broke html content: %s\n---------\n%s" % (raw_html, cleaned))
         return raw_html

diff --git a/readability/readability.py b/readability/readability.py
index 28d211c5..fe6f0e87 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -21,17 +21,31 @@
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger()
 
+unlikelyCandidatesReList = ['combx', 'comment', 'community', 'disqus', 'extra',
+                            'foot', 'header', 'menu',
+                            'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor',
+                            'ad-break', 'agegate',
+                            'pagination', 'pager', 'popup', 'tweet', 'twitter']
+
+positiveReList = ['article', 'body', 'content', 'entry', 'hentry', 'main',
+                  'page', 'pagination', 'post', 'text', 'blog', 'story']
+
+negativeReList = [
+    'combx', 'comment', 'com-', 'contact', 'foot', 'footer', 'footnote',
+    'masthead', 'media', 'meta', 'outbrain', 'promo', 'related', 'scroll',
+    'shoutbox', 'sidebar', 'sponsor', 'shopping', 'tags', 'tool', 'widget', ]
+
 REGEXES = {
     'unlikelyCandidatesRe': re.compile(
-        'combx|comment|community|disqus|extra|foot|header|menu|remark|rss|shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|tweet|twitter',
+        '|'.join(unlikelyCandidatesReList),
         re.I),
     'okMaybeItsACandidateRe': re.compile('and|article|body|column|main|shadow', re.I),
     'positiveRe': re.compile(
-        'article|body|content|entry|hentry|main|page|pagination|post|text|blog|story',
+        '|'.join(positiveReList),
         re.I),
     'negativeRe': re.compile(
-        'combx|comment|com-|contact|foot|footer|footnote|masthead|media|meta|outbrain|promo|related|scroll|shoutbox|sidebar|sponsor|shopping|tags|tool|widget',
+        '|'.join(negativeReList),
         re.I),
     'divToPElementsRe': re.compile(
         '<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
@@ -116,8 +130,10 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
         - min_text_length:
         - retry_length:
         - url: will allow adjusting links to be absolute
-        - positive_keywords: the list of positive search patterns in classes and ids, for example: ["news-item", "block"]
-        - negative_keywords: the list of negative search patterns in classes and ids, for example: ["mysidebar", "related", "ads"]
+        - positive_keywords: the list of positive search patterns in classes
+          and ids, for example: ["news-item", "block"]
+        - negative_keywords: the list of negative search patterns in classes
+          and ids, for example: ["mysidebar", "related", "ads"]
         Also positive_keywords and negative_keywords could be a regexp.
         """
         self.input = input
@@ -307,7 +323,8 @@ def score_paragraphs(self, ):
                 candidates[parent_node] = self.score_node(parent_node)
                 ordered.append(parent_node)
 
-            if grand_parent_node is not None and grand_parent_node not in candidates:
+            if grand_parent_node is not None and \
+                    grand_parent_node not in candidates:
                 candidates[grand_parent_node] = self.score_node(
                     grand_parent_node)
                 ordered.append(grand_parent_node)
@@ -488,7 +505,8 @@ def sanitize(self, node, candidates):
                 else:
                     content_score = 0
                 # if parent_node is not None:
-                #     pweight = self.class_weight(parent_node) + content_score
+                #     pweight = self.class_weight(parent_node) +
+                #               + content_score
                 #     pname = describe(parent_node)
                 # else:
                 #     pweight = 0
@@ -509,7 +527,8 @@ def sanitize(self, node, candidates):
                     to_remove = True
                 elif content_length < (MIN_LEN) and (
                         counts["img"] == 0 or counts["img"] > 2):
-                    reason = "too short content length %s without a single image" % content_length
+                    reason = "too short content length %s " \
+                             "without a single image" % content_length
                     to_remove = True
                 elif weight < 25 and link_density > 0.2:
                     reason = "too many links %.3f for its weight %s" % (
@@ -521,27 +540,9 @@ def sanitize(self, node, candidates):
                     to_remove = True
                 elif (counts["embed"] == 1 and content_length < 75) or counts[
                         "embed"] > 1:
-                    reason = "<embed>s with too short content length, or too many <embed>s"
+                    reason = "<embed>s with too short" \
+                             " content length, or too many <embed>s"
                     to_remove = True
-                # if el.tag == 'div' and counts['img'] >= 1 and to_remove:
-                #     imgs = el.findall('.//img')
-                #     valid_img = False
-                #     self.debug(tounicode(el))
-                #     for img in imgs:
-                #
-                #         height = img.get('height')
-                #         text_length = img.get('text_length')
-                #         self.debug ("height %s text_length %s" %(repr(height), repr(text_length)))
-                #         if to_int(height) >= 100 or to_int(text_length) >= 100:
-                #             valid_img = True
-                #             self.debug("valid image" + tounicode(img))
-                #             break
-                #     if valid_img:
-                #         to_remove = False
-                #         self.debug("Allowing %s" %el.text_content())
-                #         for desnode in self.tags(el, "table", "ul", "div"):
-                #             allowed[desnode] = True
-
                 # find x non empty preceding and succeeding siblings
                 i, j = 0, 0
                 x = 1
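The regex changes in this patch are intended to be behavior-preserving: joining each new list with '|' rebuilds exactly the alternation string the old literal contained. A quick check (a hypothetical snippet, not part of the series):

    import re

    unlikelyCandidatesReList = ['combx', 'comment', 'community', 'disqus', 'extra',
                                'foot', 'header', 'menu',
                                'remark', 'rss', 'shoutbox', 'sidebar', 'sponsor',
                                'ad-break', 'agegate',
                                'pagination', 'pager', 'popup', 'tweet', 'twitter']

    # the pre-patch single-string pattern, split here only for line length
    old = re.compile(
        'combx|comment|community|disqus|extra|foot|header|menu|remark|rss|'
        'shoutbox|sidebar|sponsor|ad-break|agegate|pagination|pager|popup|'
        'tweet|twitter', re.I)
    new = re.compile('|'.join(unlikelyCandidatesReList), re.I)

    assert old.pattern == new.pattern  # identical alternation, so identical matches

The same argument applies to positiveReList and negativeReList.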
From 2ceec5d14087faff7feb929063078e556eff5df6 Mon Sep 17 00:00:00 2001
From: warmonger
Date: Thu, 12 Feb 2015 17:19:48 +0600
Subject: [PATCH 6/8] refactoring

---
 readability/readability.py | 50 ++++++++++++++++++++------------------
 tests/test_article_only.py |  1 +
 2 files changed, 27 insertions(+), 24 deletions(-)

diff --git a/readability/readability.py b/readability/readability.py
index fe6f0e87..8ed68333 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -49,14 +49,6 @@
         re.I),
     'divToPElementsRe': re.compile(
         '<(a|blockquote|dl|div|img|ol|p|pre|table|ul)', re.I),
-    # 'replaceBrsRe': re.compile('(<br[^>]*>[ \n\r\t]*){2,}',re.I),
-    # 'replaceFontsRe': re.compile('<(\/?)font[^>]*>',re.I),
-    # 'trimRe': re.compile('^\s+|\s+$/'),
-    #'normalizeRe': re.compile('\s{2,}/'),
-    #'killBreaksRe': re.compile('(<br\s*\/?>(\s|&nbsp;?)*){1,}/'),
-    #'videoRe': re.compile('http:\/\/(www\.)?(youtube|vimeo)\.com', re.I),
-    # skipFootnoteLink: /^\s*(\[?[a-z0-9]{1,2}\]?|^|edit|citation
-    # needed)\s*$/i,
 }
@@ -110,10 +102,13 @@ def compile_pattern(elements):
         return elements
     if isinstance(elements, basestring):
         elements = elements.split(',')
-    return re.compile(u'|'.join([re.escape(x.lower()) for x in elements]), re.U)
+    return re.compile(
+        u'|'.join([re.escape(x.lower()) for x in elements]), re.U
+    )
 
 
 class Document:
+
     """Class to build a etree document out of html."""
 
     TEXT_LENGTH_THRESHOLD = 25
     RETRY_LENGTH = 250
@@ -130,9 +125,10 @@ def __init__(self, input, positive_keywords=None, negative_keywords=None,
         - min_text_length:
         - retry_length:
         - url: will allow adjusting links to be absolute
-        - positive_keywords: the list of positive search patterns in classes
-          and ids, for example: ["news-item", "block"]
-        - negative_keywords: the list of negative search patterns in classes
+        - positive_keywords: the list of positive search patterns in
+          classes and ids, for example: ["news-item", "block"]
+        - negative_keywords: the list of negative
+          search patterns in classes
           and ids, for example: ["mysidebar", "related", "ads"]
         Also positive_keywords and negative_keywords could be a regexp.
         """
@@ -248,8 +244,8 @@ def get_article(self, candidates, best_candidate, html_partial=False):
                 append = True
             sibling_key = sibling  # HashableElement(sibling)
             if sibling_key in candidates and \
-                    candidates[sibling_key][
-                        'content_score'] >= sibling_score_threshold:
+                    candidates[sibling_key]['content_score'] >= \
+                    sibling_score_threshold:
                 append = True
 
             if sibling.tag == "p":
@@ -277,7 +273,11 @@ def get_article(self, candidates, best_candidate, html_partial=False):
 
     def select_best_candidate(self, candidates):
         sorted_candidates = sorted(
-            candidates.values(), key=lambda x: x['content_score'], reverse=True)
+            candidates.values(),
+            key=lambda x: x['content_score'],
+            reverse=True
+        )
+
         for candidate in sorted_candidates[:5]:
             elem = candidate['elem']
             self.debug("Top 5 : %6.3f %s" % (
@@ -324,7 +324,7 @@ def score_paragraphs(self, ):
                 ordered.append(parent_node)
 
             if grand_parent_node is not None and \
-                grand_parent_node not in candidates:
+                    grand_parent_node not in candidates:
                 candidates[grand_parent_node] = self.score_node(
                     grand_parent_node)
                 ordered.append(grand_parent_node)
@@ -376,11 +376,11 @@ def class_weight(self, e):
                 weight -= 25
 
         if self.positive_keywords and self.positive_keywords.match(
-            'tag-' + e.tag):
+                'tag-' + e.tag):
             weight += 25
 
         if self.negative_keywords and self.negative_keywords.match(
-            'tag-' + e.tag):
+                'tag-' + e.tag):
             weight -= 25
 
         return weight
@@ -413,7 +413,7 @@ def remove_unlikely_candidates(self):
             # self.debug(s)
             if REGEXES['unlikelyCandidatesRe'].search(s) and (
                 not REGEXES['okMaybeItsACandidateRe'].search(
-                    s)) and elem.tag not in ['html', 'body']:
+                        s)) and elem.tag not in ['html', 'body']:
                 self.debug("Removing unlikely candidate - %s" % describe(elem))
                 elem.drop_tree()
@@ -510,7 +510,7 @@ def sanitize(self, node, candidates):
                 #     pname = describe(parent_node)
                 # else:
                 #     pweight = 0
-                #pname = "no parent"
+                # pname = "no parent"
 
                 to_remove = False
                 reason = ""
@@ -519,14 +519,15 @@ def sanitize(self, node, candidates):
                 if counts["p"] and counts["img"] > counts["p"]:
                     reason = "too many images (%s)" % counts["img"]
                     to_remove = True
-                elif counts["li"] > counts["p"] and tag != "ul" and tag != "ol":
+                elif counts["li"] > counts["p"] \
+                        and tag != "ul" and tag != "ol":
                     reason = "more <li>s than <p>s"
                     to_remove = True
                 elif counts["input"] > (counts["p"] / 3):
                     reason = "less than 3x <p>s than <input>s"
                     to_remove = True
                 elif content_length < (MIN_LEN) and (
-                    counts["img"] == 0 or counts["img"] > 2):
+                        counts["img"] == 0 or counts["img"] > 2):
                     reason = "too short content length %s " \
                              "without a single image" % content_length
                     to_remove = True
@@ -539,7 +540,7 @@ def sanitize(self, node, candidates):
                         link_density, weight)
                     to_remove = True
                 elif (counts["embed"] == 1 and content_length < 75) or counts[
-                    "embed"] > 1:
+                        "embed"] > 1:
                     reason = "<embed>s with too short" \
                              " content length, or too many <embed>s"
                     to_remove = True
@@ -588,6 +589,7 @@ def sanitize(self, node, candidates):
 
 
 class HashableElement():
+
     def __init__(self, node):
         self.node = node
         self._path = None
@@ -649,7 +651,7 @@ def main():
             url=options.url,
             positive_keywords=options.positive_keywords,
             negative_keywords=options.negative_keywords,
-            ).summary().encode(enc, 'replace')
+        ).summary().encode(enc, 'replace')
     finally:
         file.close()

diff --git a/tests/test_article_only.py b/tests/test_article_only.py
index fc3f9941..b255e029 100644
--- a/tests/test_article_only.py
+++ b/tests/test_article_only.py
@@ -15,6 +15,7 @@ def load_sample(filename):
 
 
 class TestArticleOnly(unittest.TestCase):
+
     """The option to not get back a full html doc should work
 
     Given a full html document, the call can request just divs of processed
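compile_pattern is only re-wrapped in this patch, so its contract is unchanged: a precompiled regexp passes through, a comma-separated string is split, and any list is lowercased, escaped and joined into one alternation. A condensed runnable copy for illustration (spelled with str so the snippet also runs on Python 3; the real pre-patch-8 code tests basestring):

    import re

    regexp_type = type(re.compile('x'))

    def compile_pattern(elements):
        if not elements:
            return None
        if isinstance(elements, regexp_type):
            return elements
        if isinstance(elements, str):  # `basestring` in the actual Python 2 code
            elements = elements.split(',')
        return re.compile(
            u'|'.join([re.escape(x.lower()) for x in elements]), re.U
        )

    print(compile_pattern('News-Item,block').pattern)       # news\-item|block (escaping
    print(compile_pattern(['News-Item', 'block']).pattern)  # details vary by Python version)

Both calls produce the same pattern, which is what lets Document accept positive_keywords and negative_keywords as either a string or a list.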
From 297b8cc6a3854995c386f28098c44a1375784dc7 Mon Sep 17 00:00:00 2001
From: warmonger
Date: Thu, 12 Feb 2015 17:21:34 +0600
Subject: [PATCH 7/8] refactoring

---
 .travis.yml |  1 +
 setup.py    | 13 +------------
 2 files changed, 2 insertions(+), 12 deletions(-)

diff --git a/.travis.yml b/.travis.yml
index d8a19975..51c22f71 100644
--- a/.travis.yml
+++ b/.travis.yml
@@ -5,6 +5,7 @@ python:
   - "3.4"
 
 install:
+  - pip install -r requirements.txt
   - pip install pytest-cov --use-mirrors
   - pip install pytest-pep8 --use-mirrors
   - pip install coveralls --use-mirrors

diff --git a/setup.py b/setup.py
index 9f4bb71d..7872e578 100755
--- a/setup.py
+++ b/setup.py
@@ -3,17 +3,6 @@
 
 from setuptools import setup
 
-
-lxml_requirement = "lxml"
-if sys.platform == 'darwin':
-    import platform
-
-    mac_ver = platform.mac_ver()[0]
-    mac_ver_no = int(mac_ver.split('.')[1])
-    if mac_ver_no < 9:
-        print "Using lxml<2.4"
-        lxml_requirement = "lxml<2.4"
-
 setup(
     name="readability-lxml",
     version="0.3.0.5",
@@ -27,7 +16,7 @@
     packages=['readability'],
     install_requires=[
         "chardet",
-        lxml_requirement
+        "lxml",
     ],
     classifiers=[
         "Environment :: Web Environment",
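For reference, the block deleted from setup.py above pinned an older lxml on pre-Mavericks OS X; with dependencies now in requirements.txt, anyone who still needs the pin can express it there instead. The removed logic as a standalone sketch (print rewritten in function form only so the snippet also parses on Python 3):

    import sys

    lxml_requirement = "lxml"
    if sys.platform == 'darwin':
        import platform

        mac_ver = platform.mac_ver()[0]          # e.g. '10.8.5'
        mac_ver_no = int(mac_ver.split('.')[1])  # minor version: 8 for 10.8.x
        if mac_ver_no < 9:                       # anything older than OS X 10.9
            print("Using lxml<2.4")
            lxml_requirement = "lxml<2.4"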
From 6a1c779eddd56bf27143d4735a36933e0c30da67 Mon Sep 17 00:00:00 2001
From: warmonger
Date: Mon, 23 Feb 2015 20:59:29 +0600
Subject: [PATCH 8/8] Fix.

Add tox
---
 .gitignore                 |  3 +++
 readability/htmls.py       | 13 +++++++++----
 readability/readability.py | 38 +++++++++++++++++++++++++-------------
 tox.ini                    | 14 ++++++++++++++
 4 files changed, 51 insertions(+), 17 deletions(-)
 create mode 100644 tox.ini

diff --git a/.gitignore b/.gitignore
index 84fca1f2..16a2c86e 100644
--- a/.gitignore
+++ b/.gitignore
@@ -9,3 +9,6 @@ dist
 /man
 nosetests.xml
 .coverage
+.tox
+.idea
+.cache

diff --git a/readability/htmls.py b/readability/htmls.py
index c1bf23c5..729872d4 100644
--- a/readability/htmls.py
+++ b/readability/htmls.py
@@ -4,16 +4,21 @@
 
 from lxml.html import tostring
 import lxml.html
+import sys
 
-from cleaners import normalize_spaces, clean_attributes
-from encoding import get_encoding
+from .cleaners import normalize_spaces, clean_attributes
+from .encoding import get_encoding
 
 utf8_parser = lxml.html.HTMLParser(encoding='utf-8')
 
 
+if sys.version < '3':
+    str = unicode
+
+
 def build_doc(page):
-    if isinstance(page, unicode):
+    if isinstance(page, str):
         enc = None
         page_unicode = page
     else:
@@ -118,7 +123,7 @@ def shorten_title(doc):
 
 def get_body(doc):
     [elem.drop_tree() for elem in doc.xpath('.//script | .//link | .//style')]
-    raw_html = unicode(tostring(doc.body or doc))
+    raw_html = str(tostring(doc.body or doc))
     cleaned = clean_attributes(raw_html)
     try:
         # BeautifulSoup(cleaned) #FIXME do we really need to try loading it?
         return cleaned

diff --git a/readability/readability.py b/readability/readability.py
index 8ed68333..6e03b634 100755
--- a/readability/readability.py
+++ b/readability/readability.py
@@ -10,12 +10,16 @@
 from lxml.html import document_fromstring
 from lxml.html import fragment_fromstring
 
-from cleaners import clean_attributes
-from cleaners import html_cleaner
-from htmls import build_doc
-from htmls import get_body
-from htmls import get_title
-from htmls import shorten_title
+from .cleaners import clean_attributes
+from .cleaners import html_cleaner
+from .htmls import build_doc
+from .htmls import get_body
+from .htmls import get_title
+from .htmls import shorten_title
+
+
+if sys.version < '3':
+    str = unicode
 
 logging.basicConfig(level=logging.INFO)
 log = logging.getLogger()
@@ -100,8 +104,13 @@ def compile_pattern(elements):
         return None
     if isinstance(elements, regexp_type):
         return elements
-    if isinstance(elements, basestring):
-        elements = elements.split(',')
+
+    if sys.version_info.major == 2:
+        if isinstance(elements, basestring):
+            elements = elements.split(',')
+    else:
+        if isinstance(elements, str):
+            elements = elements.split(',')
     return re.compile(
         u'|'.join([re.escape(x.lower()) for x in elements]), re.U
     )
@@ -219,9 +228,9 @@ def summary(self, html_partial=False):
                     continue
                 else:
                     return cleaned_article
-        except StandardError, e:
+        except Exception as e:
             log.exception('error getting summary: ')
-            raise Unparseable(str(e)), None, sys.exc_info()[2]
+            raise Unparseable(str(e))
@@ -425,8 +434,11 @@ def transform_misused_divs_into_paragraphs(self):
             #   are not direct children of elem
             # This results in incorrect results in case there is an <img>
             # buried within an <a> for example
+
             if not REGEXES['divToPElementsRe'].search(
-                    unicode(''.join(map(tostring, list(elem))))):
+                    str(''.join(map(str, map(tostring,
+                                             list(elem))
+                                    )))):
                 # self.debug("Altering %s to p" % (describe(elem)))
                 elem.tag = "p"
                 # print "Fixed element "+describe(elem)
@@ -646,12 +658,12 @@ def main():
     # XXX: this hack could not always work, better to set PYTHONIOENCODING
     enc = sys.__stdout__.encoding or 'utf-8'
     try:
-        print Document(file.read(),
+        print(Document(file.read(),
                        debug=options.verbose,
                        url=options.url,
                        positive_keywords=options.positive_keywords,
                        negative_keywords=options.negative_keywords,
-                       ).summary().encode(enc, 'replace')
+                       ).summary().encode(enc, 'replace'))
     finally:
         file.close()

diff --git a/tox.ini b/tox.ini
new file mode 100644
index 00000000..a7234205
--- /dev/null
+++ b/tox.ini
@@ -0,0 +1,14 @@
+# Tox (http://tox.testrun.org/) is a tool for running tests
+# in multiple virtualenvs. This configuration file will run the
+# test suite on all supported python versions. To use it, "pip install tox"
+# and then run "tox" from this directory.
+
+[tox]
+envlist = py27, py34, py26, py33
+
+[testenv]
+commands =
+    pip install -r requirements.txt
+    pip install pytest-pep8 --use-mirrors
+    py.test --pep8 readability
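With tox.ini in place the whole interpreter matrix runs locally via "pip install tox" followed by "tox", as the file's own comment says. And after the relative imports, the str/unicode alias and the print() fix above, the same entry points should behave identically on Python 2.7 and 3.4; a hypothetical smoke test (the file name and HTML here are made up):

    # smoke_test.py -- expected to run unchanged on Python 2.7 and 3.4
    from readability.readability import Document

    # a tiny synthetic page with enough paragraph text to score as content
    html = "<html><body><p>" + ("readable text " * 50) + "</p></body></html>"
    doc = Document(html)
    print(doc.summary(html_partial=True))  # print() is now a function call on both lines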