aboutsummaryrefslogtreecommitdiffstats
path: root/libmproxy/contrib/html2text.py
diff options
context:
space:
mode:
Diffstat (limited to 'libmproxy/contrib/html2text.py')
-rw-r--r--libmproxy/contrib/html2text.py834
1 files changed, 0 insertions, 834 deletions
diff --git a/libmproxy/contrib/html2text.py b/libmproxy/contrib/html2text.py
deleted file mode 100644
index 035a596b..00000000
--- a/libmproxy/contrib/html2text.py
+++ /dev/null
@@ -1,834 +0,0 @@
-#!/usr/bin/env python
-"""html2text: Turn HTML into equivalent Markdown-structured text."""
-__version__ = "3.200.3"
-__author__ = "Aaron Swartz (me@aaronsw.com)"
-__copyright__ = "(C) 2004-2008 Aaron Swartz. GNU GPL 3."
-__contributors__ = ["Martin 'Joey' Schulze", "Ricardo Reyes", "Kevin Jay North"]
-
-# TODO:
-# Support decoded entities with unifiable.
-
-try:
- True
-except NameError:
- setattr(__builtins__, 'True', 1)
- setattr(__builtins__, 'False', 0)
-
-def has_key(x, y):
- if hasattr(x, 'has_key'): return x.has_key(y)
- else: return y in x
-
-try:
- import htmlentitydefs
- import urlparse
- import HTMLParser
-except ImportError: #Python3
- import html.entities as htmlentitydefs
- import urllib.parse as urlparse
- import html.parser as HTMLParser
-try: #Python3
- import urllib.request as urllib
-except:
- import urllib
-import optparse, re, sys, codecs, types
-
-try: from textwrap import wrap
-except: pass
-
-# Use Unicode characters instead of their ascii psuedo-replacements
-UNICODE_SNOB = 0
-
-# Put the links after each paragraph instead of at the end.
-LINKS_EACH_PARAGRAPH = 0
-
-# Wrap long lines at position. 0 for no wrapping. (Requires Python 2.3.)
-BODY_WIDTH = 78
-
-# Don't show internal links (href="#local-anchor") -- corresponding link targets
-# won't be visible in the plain text file anyway.
-SKIP_INTERNAL_LINKS = True
-
-# Use inline, rather than reference, formatting for images and links
-INLINE_LINKS = True
-
-# Number of pixels Google indents nested lists
-GOOGLE_LIST_INDENT = 36
-
-IGNORE_ANCHORS = False
-IGNORE_IMAGES = False
-IGNORE_EMPHASIS = False
-
-### Entity Nonsense ###
-
-def name2cp(k):
- if k == 'apos': return ord("'")
- if hasattr(htmlentitydefs, "name2codepoint"): # requires Python 2.3
- return htmlentitydefs.name2codepoint[k]
- else:
- k = htmlentitydefs.entitydefs[k]
- if k.startswith("&#") and k.endswith(";"): return int(k[2:-1]) # not in latin-1
- return ord(codecs.latin_1_decode(k)[0])
-
-unifiable = {'rsquo':"'", 'lsquo':"'", 'rdquo':'"', 'ldquo':'"',
-'copy':'(C)', 'mdash':'--', 'nbsp':' ', 'rarr':'->', 'larr':'<-', 'middot':'*',
-'ndash':'-', 'oelig':'oe', 'aelig':'ae',
-'agrave':'a', 'aacute':'a', 'acirc':'a', 'atilde':'a', 'auml':'a', 'aring':'a',
-'egrave':'e', 'eacute':'e', 'ecirc':'e', 'euml':'e',
-'igrave':'i', 'iacute':'i', 'icirc':'i', 'iuml':'i',
-'ograve':'o', 'oacute':'o', 'ocirc':'o', 'otilde':'o', 'ouml':'o',
-'ugrave':'u', 'uacute':'u', 'ucirc':'u', 'uuml':'u',
-'lrm':'', 'rlm':''}
-
-unifiable_n = {}
-
-for k in unifiable.keys():
- unifiable_n[name2cp(k)] = unifiable[k]
-
-### End Entity Nonsense ###
-
-def onlywhite(line):
- """Return true if the line does only consist of whitespace characters."""
- for c in line:
- if c is not ' ' and c is not ' ':
- return c is ' '
- return line
-
-def hn(tag):
- if tag[0] == 'h' and len(tag) == 2:
- try:
- n = int(tag[1])
- if n in range(1, 10): return n
- except ValueError: return 0
-
-def dumb_property_dict(style):
- """returns a hash of css attributes"""
- return dict([(x.strip(), y.strip()) for x, y in [z.split(':', 1) for z in style.split(';') if ':' in z]]);
-
-def dumb_css_parser(data):
- """returns a hash of css selectors, each of which contains a hash of css attributes"""
- # remove @import sentences
- importIndex = data.find('@import')
- while importIndex != -1:
- data = data[0:importIndex] + data[data.find(';', importIndex) + 1:]
- importIndex = data.find('@import')
-
- # parse the css. reverted from dictionary compehension in order to support older pythons
- elements = [x.split('{') for x in data.split('}') if '{' in x.strip()]
- try:
- elements = dict([(a.strip(), dumb_property_dict(b)) for a, b in elements])
- except ValueError:
- elements = {} # not that important
-
- return elements
-
-def element_style(attrs, style_def, parent_style):
- """returns a hash of the 'final' style attributes of the element"""
- style = parent_style.copy()
- if 'class' in attrs:
- for css_class in attrs['class'].split():
- css_style = style_def['.' + css_class]
- style.update(css_style)
- if 'style' in attrs:
- immediate_style = dumb_property_dict(attrs['style'])
- style.update(immediate_style)
- return style
-
-def google_list_style(style):
- """finds out whether this is an ordered or unordered list"""
- if 'list-style-type' in style:
- list_style = style['list-style-type']
- if list_style in ['disc', 'circle', 'square', 'none']:
- return 'ul'
- return 'ol'
-
-def google_has_height(style):
- """check if the style of the element has the 'height' attribute explicitly defined"""
- if 'height' in style:
- return True
- return False
-
-def google_text_emphasis(style):
- """return a list of all emphasis modifiers of the element"""
- emphasis = []
- if 'text-decoration' in style:
- emphasis.append(style['text-decoration'])
- if 'font-style' in style:
- emphasis.append(style['font-style'])
- if 'font-weight' in style:
- emphasis.append(style['font-weight'])
- return emphasis
-
-def google_fixed_width_font(style):
- """check if the css of the current element defines a fixed width font"""
- font_family = ''
- if 'font-family' in style:
- font_family = style['font-family']
- if 'Courier New' == font_family or 'Consolas' == font_family:
- return True
- return False
-
-def list_numbering_start(attrs):
- """extract numbering from list element attributes"""
- if 'start' in attrs:
- return int(attrs['start']) - 1
- else:
- return 0
-
-class HTML2Text(HTMLParser.HTMLParser):
- def __init__(self, out=None, baseurl=''):
- HTMLParser.HTMLParser.__init__(self)
-
- # Config options
- self.unicode_snob = UNICODE_SNOB
- self.links_each_paragraph = LINKS_EACH_PARAGRAPH
- self.body_width = BODY_WIDTH
- self.skip_internal_links = SKIP_INTERNAL_LINKS
- self.inline_links = INLINE_LINKS
- self.google_list_indent = GOOGLE_LIST_INDENT
- self.ignore_links = IGNORE_ANCHORS
- self.ignore_images = IGNORE_IMAGES
- self.ignore_emphasis = IGNORE_EMPHASIS
- self.google_doc = False
- self.ul_item_mark = '*'
-
- if out is None:
- self.out = self.outtextf
- else:
- self.out = out
-
- self.outtextlist = [] # empty list to store output characters before they are "joined"
-
- try:
- self.outtext = unicode()
- except NameError: # Python3
- self.outtext = str()
-
- self.quiet = 0
- self.p_p = 0 # number of newline character to print before next output
- self.outcount = 0
- self.start = 1
- self.space = 0
- self.a = []
- self.astack = []
- self.acount = 0
- self.list = []
- self.blockquote = 0
- self.pre = 0
- self.startpre = 0
- self.code = False
- self.br_toggle = ''
- self.lastWasNL = 0
- self.lastWasList = False
- self.style = 0
- self.style_def = {}
- self.tag_stack = []
- self.emphasis = 0
- self.drop_white_space = 0
- self.inheader = False
- self.abbr_title = None # current abbreviation definition
- self.abbr_data = None # last inner HTML (for abbr being defined)
- self.abbr_list = {} # stack of abbreviations to write later
- self.baseurl = baseurl
-
- try: del unifiable_n[name2cp('nbsp')]
- except KeyError: pass
- unifiable['nbsp'] = '&nbsp_place_holder;'
-
-
- def feed(self, data):
- data = data.replace("</' + 'script>", "</ignore>")
- HTMLParser.HTMLParser.feed(self, data)
-
- def handle(self, data):
- self.feed(data)
- self.feed("")
- return self.optwrap(self.close())
-
- def outtextf(self, s):
- self.outtextlist.append(s)
- if s: self.lastWasNL = s[-1] == '\n'
-
- def close(self):
- HTMLParser.HTMLParser.close(self)
-
- self.pbr()
- self.o('', 0, 'end')
-
- self.outtext = self.outtext.join(self.outtextlist)
- if self.unicode_snob:
- nbsp = unichr(name2cp('nbsp'))
- else:
- nbsp = u' '
- self.outtext = self.outtext.replace(u'&nbsp_place_holder;', nbsp)
-
- return self.outtext
-
- def handle_charref(self, c):
- self.o(self.charref(c), 1)
-
- def handle_entityref(self, c):
- self.o(self.entityref(c), 1)
-
- def handle_starttag(self, tag, attrs):
- self.handle_tag(tag, attrs, 1)
-
- def handle_endtag(self, tag):
- self.handle_tag(tag, None, 0)
-
- def previousIndex(self, attrs):
- """ returns the index of certain set of attributes (of a link) in the
- self.a list
-
- If the set of attributes is not found, returns None
- """
- if not has_key(attrs, 'href'): return None
-
- i = -1
- for a in self.a:
- i += 1
- match = 0
-
- if has_key(a, 'href') and a['href'] == attrs['href']:
- if has_key(a, 'title') or has_key(attrs, 'title'):
- if (has_key(a, 'title') and has_key(attrs, 'title') and
- a['title'] == attrs['title']):
- match = True
- else:
- match = True
-
- if match: return i
-
- def drop_last(self, nLetters):
- if not self.quiet:
- self.outtext = self.outtext[:-nLetters]
-
- def handle_emphasis(self, start, tag_style, parent_style):
- """handles various text emphases"""
- tag_emphasis = google_text_emphasis(tag_style)
- parent_emphasis = google_text_emphasis(parent_style)
-
- # handle Google's text emphasis
- strikethrough = 'line-through' in tag_emphasis and self.hide_strikethrough
- bold = 'bold' in tag_emphasis and not 'bold' in parent_emphasis
- italic = 'italic' in tag_emphasis and not 'italic' in parent_emphasis
- fixed = google_fixed_width_font(tag_style) and not \
- google_fixed_width_font(parent_style) and not self.pre
-
- if start:
- # crossed-out text must be handled before other attributes
- # in order not to output qualifiers unnecessarily
- if bold or italic or fixed:
- self.emphasis += 1
- if strikethrough:
- self.quiet += 1
- if italic:
- self.o("_")
- self.drop_white_space += 1
- if bold:
- self.o("**")
- self.drop_white_space += 1
- if fixed:
- self.o('`')
- self.drop_white_space += 1
- self.code = True
- else:
- if bold or italic or fixed:
- # there must not be whitespace before closing emphasis mark
- self.emphasis -= 1
- self.space = 0
- self.outtext = self.outtext.rstrip()
- if fixed:
- if self.drop_white_space:
- # empty emphasis, drop it
- self.drop_last(1)
- self.drop_white_space -= 1
- else:
- self.o('`')
- self.code = False
- if bold:
- if self.drop_white_space:
- # empty emphasis, drop it
- self.drop_last(2)
- self.drop_white_space -= 1
- else:
- self.o("**")
- if italic:
- if self.drop_white_space:
- # empty emphasis, drop it
- self.drop_last(1)
- self.drop_white_space -= 1
- else:
- self.o("_")
- # space is only allowed after *all* emphasis marks
- if (bold or italic) and not self.emphasis:
- self.o(" ")
- if strikethrough:
- self.quiet -= 1
-
- def handle_tag(self, tag, attrs, start):
- #attrs = fixattrs(attrs)
- if attrs is None:
- attrs = {}
- else:
- attrs = dict(attrs)
-
- if self.google_doc:
- # the attrs parameter is empty for a closing tag. in addition, we
- # need the attributes of the parent nodes in order to get a
- # complete style description for the current element. we assume
- # that google docs export well formed html.
- parent_style = {}
- if start:
- if self.tag_stack:
- parent_style = self.tag_stack[-1][2]
- tag_style = element_style(attrs, self.style_def, parent_style)
- self.tag_stack.append((tag, attrs, tag_style))
- else:
- dummy, attrs, tag_style = self.tag_stack.pop()
- if self.tag_stack:
- parent_style = self.tag_stack[-1][2]
-
- if hn(tag):
- self.p()
- if start:
- self.inheader = True
- self.o(hn(tag)*"#" + ' ')
- else:
- self.inheader = False
- return # prevent redundant emphasis marks on headers
-
- if tag in ['p', 'div']:
- if self.google_doc:
- if start and google_has_height(tag_style):
- self.p()
- else:
- self.soft_br()
- else:
- self.p()
-
- if tag == "br" and start: self.o(" \n")
-
- if tag == "hr" and start:
- self.p()
- self.o("* * *")
- self.p()
-
- if tag in ["head", "style", 'script']:
- if start: self.quiet += 1
- else: self.quiet -= 1
-
- if tag == "style":
- if start: self.style += 1
- else: self.style -= 1
-
- if tag in ["body"]:
- self.quiet = 0 # sites like 9rules.com never close <head>
-
- if tag == "blockquote":
- if start:
- self.p(); self.o('> ', 0, 1); self.start = 1
- self.blockquote += 1
- else:
- self.blockquote -= 1
- self.p()
-
- if tag in ['em', 'i', 'u'] and not self.ignore_emphasis: self.o("_")
- if tag in ['strong', 'b'] and not self.ignore_emphasis: self.o("**")
- if tag in ['del', 'strike', 's']:
- if start:
- self.o("<"+tag+">")
- else:
- self.o("</"+tag+">")
-
- if self.google_doc:
- if not self.inheader:
- # handle some font attributes, but leave headers clean
- self.handle_emphasis(start, tag_style, parent_style)
-
- if tag in ["code", "tt"] and not self.pre: self.o('`') #TODO: `` `this` ``
- if tag == "abbr":
- if start:
- self.abbr_title = None
- self.abbr_data = ''
- if has_key(attrs, 'title'):
- self.abbr_title = attrs['title']
- else:
- if self.abbr_title != None:
- self.abbr_list[self.abbr_data] = self.abbr_title
- self.abbr_title = None
- self.abbr_data = ''
-
- if tag == "a" and not self.ignore_links:
- if start:
- if has_key(attrs, 'href') and not (self.skip_internal_links and attrs['href'].startswith('#')):
- self.astack.append(attrs)
- self.o("[")
- else:
- self.astack.append(None)
- else:
- if self.astack:
- a = self.astack.pop()
- if a:
- if self.inline_links:
- self.o("](" + escape_md(a['href']) + ")")
- else:
- i = self.previousIndex(a)
- if i is not None:
- a = self.a[i]
- else:
- self.acount += 1
- a['count'] = self.acount
- a['outcount'] = self.outcount
- self.a.append(a)
- self.o("][" + str(a['count']) + "]")
-
- if tag == "img" and start and not self.ignore_images:
- if has_key(attrs, 'src'):
- attrs['href'] = attrs['src']
- alt = attrs.get('alt', '')
- self.o("![" + escape_md(alt) + "]")
-
- if self.inline_links:
- self.o("(" + escape_md(attrs['href']) + ")")
- else:
- i = self.previousIndex(attrs)
- if i is not None:
- attrs = self.a[i]
- else:
- self.acount += 1
- attrs['count'] = self.acount
- attrs['outcount'] = self.outcount
- self.a.append(attrs)
- self.o("[" + str(attrs['count']) + "]")
-
- if tag == 'dl' and start: self.p()
- if tag == 'dt' and not start: self.pbr()
- if tag == 'dd' and start: self.o(' ')
- if tag == 'dd' and not start: self.pbr()
-
- if tag in ["ol", "ul"]:
- # Google Docs create sub lists as top level lists
- if (not self.list) and (not self.lastWasList):
- self.p()
- if start:
- if self.google_doc:
- list_style = google_list_style(tag_style)
- else:
- list_style = tag
- numbering_start = list_numbering_start(attrs)
- self.list.append({'name':list_style, 'num':numbering_start})
- else:
- if self.list: self.list.pop()
- self.lastWasList = True
- else:
- self.lastWasList = False
-
- if tag == 'li':
- self.pbr()
- if start:
- if self.list: li = self.list[-1]
- else: li = {'name':'ul', 'num':0}
- if self.google_doc:
- nest_count = self.google_nest_count(tag_style)
- else:
- nest_count = len(self.list)
- self.o(" " * nest_count) #TODO: line up <ol><li>s > 9 correctly.
- if li['name'] == "ul": self.o(self.ul_item_mark + " ")
- elif li['name'] == "ol":
- li['num'] += 1
- self.o(str(li['num'])+". ")
- self.start = 1
-
- if tag in ["table", "tr"] and start: self.p()
- if tag == 'td': self.pbr()
-
- if tag == "pre":
- if start:
- self.startpre = 1
- self.pre = 1
- else:
- self.pre = 0
- self.p()
-
- def pbr(self):
- if self.p_p == 0:
- self.p_p = 1
-
- def p(self):
- self.p_p = 2
-
- def soft_br(self):
- self.pbr()
- self.br_toggle = ' '
-
- def o(self, data, puredata=0, force=0):
- if self.abbr_data is not None:
- self.abbr_data += data
-
- if not self.quiet:
- if self.google_doc:
- # prevent white space immediately after 'begin emphasis' marks ('**' and '_')
- lstripped_data = data.lstrip()
- if self.drop_white_space and not (self.pre or self.code):
- data = lstripped_data
- if lstripped_data != '':
- self.drop_white_space = 0
-
- if puredata and not self.pre:
- data = re.sub('\s+', ' ', data)
- if data and data[0] == ' ':
- self.space = 1
- data = data[1:]
- if not data and not force: return
-
- if self.startpre:
- #self.out(" :") #TODO: not output when already one there
- self.startpre = 0
-
- bq = (">" * self.blockquote)
- if not (force and data and data[0] == ">") and self.blockquote: bq += " "
-
- if self.pre:
- bq += " "
- data = data.replace("\n", "\n"+bq)
-
- if self.start:
- self.space = 0
- self.p_p = 0
- self.start = 0
-
- if force == 'end':
- # It's the end.
- self.p_p = 0
- self.out("\n")
- self.space = 0
-
- if self.p_p:
- self.out((self.br_toggle+'\n'+bq)*self.p_p)
- self.space = 0
- self.br_toggle = ''
-
- if self.space:
- if not self.lastWasNL: self.out(' ')
- self.space = 0
-
- if self.a and ((self.p_p == 2 and self.links_each_paragraph) or force == "end"):
- if force == "end": self.out("\n")
-
- newa = []
- for link in self.a:
- if self.outcount > link['outcount']:
- self.out(" ["+ str(link['count']) +"]: " + urlparse.urljoin(self.baseurl, link['href']))
- if has_key(link, 'title'): self.out(" ("+link['title']+")")
- self.out("\n")
- else:
- newa.append(link)
-
- if self.a != newa: self.out("\n") # Don't need an extra line when nothing was done.
-
- self.a = newa
-
- if self.abbr_list and force == "end":
- for abbr, definition in self.abbr_list.items():
- self.out(" *[" + abbr + "]: " + definition + "\n")
-
- self.p_p = 0
- self.out(data)
- self.outcount += 1
-
- def handle_data(self, data):
- if r'\/script>' in data: self.quiet -= 1
-
- if self.style:
- self.style_def.update(dumb_css_parser(data))
-
- self.o(data, 1)
-
- def unknown_decl(self, data): pass
-
- def charref(self, name):
- if name[0] in ['x','X']:
- c = int(name[1:], 16)
- else:
- c = int(name)
-
- if not self.unicode_snob and c in unifiable_n.keys():
- return unifiable_n[c]
- else:
- try:
- return unichr(c)
- except NameError: #Python3
- return chr(c)
-
- def entityref(self, c):
- if not self.unicode_snob and c in unifiable.keys():
- return unifiable[c]
- else:
- try: name2cp(c)
- except KeyError: return "&" + c + ';'
- else:
- try:
- return unichr(name2cp(c))
- except NameError: #Python3
- return chr(name2cp(c))
-
- def replaceEntities(self, s):
- s = s.group(1)
- if s[0] == "#":
- return self.charref(s[1:])
- else: return self.entityref(s)
-
- r_unescape = re.compile(r"&(#?[xX]?(?:[0-9a-fA-F]+|\w{1,8}));")
- def unescape(self, s):
- return self.r_unescape.sub(self.replaceEntities, s)
-
- def google_nest_count(self, style):
- """calculate the nesting count of google doc lists"""
- nest_count = 0
- if 'margin-left' in style:
- nest_count = int(style['margin-left'][:-2]) / self.google_list_indent
- return nest_count
-
-
- def optwrap(self, text):
- """Wrap all paragraphs in the provided text."""
- if not self.body_width:
- return text
-
- assert wrap, "Requires Python 2.3."
- result = ''
- newlines = 0
- for para in text.split("\n"):
- if len(para) > 0:
- if not skipwrap(para):
- for line in wrap(para, self.body_width):
- result += line + "\n"
- result += "\n"
- newlines = 2
- else:
- if not onlywhite(para):
- result += para + "\n"
- newlines = 1
- else:
- if newlines < 2:
- result += "\n"
- newlines += 1
- return result
-
-ordered_list_matcher = re.compile(r'\d+\.\s')
-unordered_list_matcher = re.compile(r'[-\*\+]\s')
-md_chars_matcher = re.compile(r"([\\\[\]\(\)])")
-
-def skipwrap(para):
- # If the text begins with four spaces or one tab, it's a code block; don't wrap
- if para[0:4] == ' ' or para[0] == '\t':
- return True
- # If the text begins with only two "--", possibly preceded by whitespace, that's
- # an emdash; so wrap.
- stripped = para.lstrip()
- if stripped[0:2] == "--" and stripped[2] != "-":
- return False
- # I'm not sure what this is for; I thought it was to detect lists, but there's
- # a <br>-inside-<span> case in one of the tests that also depends upon it.
- if stripped[0:1] == '-' or stripped[0:1] == '*':
- return True
- # If the text begins with a single -, *, or +, followed by a space, or an integer,
- # followed by a ., followed by a space (in either case optionally preceeded by
- # whitespace), it's a list; don't wrap.
- if ordered_list_matcher.match(stripped) or unordered_list_matcher.match(stripped):
- return True
- return False
-
-def wrapwrite(text):
- text = text.encode('utf-8')
- try: #Python3
- sys.stdout.buffer.write(text)
- except AttributeError:
- sys.stdout.write(text)
-
-def html2text(html, baseurl=''):
- h = HTML2Text(baseurl=baseurl)
- return h.handle(html)
-
-def unescape(s, unicode_snob=False):
- h = HTML2Text()
- h.unicode_snob = unicode_snob
- return h.unescape(s)
-
-def escape_md(text):
- """Escapes markdown-sensitive characters."""
- return md_chars_matcher.sub(r"\\\1", text)
-
-def main():
- baseurl = ''
-
- p = optparse.OptionParser('%prog [(filename|url) [encoding]]',
- version='%prog ' + __version__)
- p.add_option("--ignore-emphasis", dest="ignore_emphasis", action="store_true",
- default=IGNORE_EMPHASIS, help="don't include any formatting for emphasis")
- p.add_option("--ignore-links", dest="ignore_links", action="store_true",
- default=IGNORE_ANCHORS, help="don't include any formatting for links")
- p.add_option("--ignore-images", dest="ignore_images", action="store_true",
- default=IGNORE_IMAGES, help="don't include any formatting for images")
- p.add_option("-g", "--google-doc", action="store_true", dest="google_doc",
- default=False, help="convert an html-exported Google Document")
- p.add_option("-d", "--dash-unordered-list", action="store_true", dest="ul_style_dash",
- default=False, help="use a dash rather than a star for unordered list items")
- p.add_option("-b", "--body-width", dest="body_width", action="store", type="int",
- default=BODY_WIDTH, help="number of characters per output line, 0 for no wrap")
- p.add_option("-i", "--google-list-indent", dest="list_indent", action="store", type="int",
- default=GOOGLE_LIST_INDENT, help="number of pixels Google indents nested lists")
- p.add_option("-s", "--hide-strikethrough", action="store_true", dest="hide_strikethrough",
- default=False, help="hide strike-through text. only relevent when -g is specified as well")
- (options, args) = p.parse_args()
-
- # process input
- encoding = "utf-8"
- if len(args) > 0:
- file_ = args[0]
- if len(args) == 2:
- encoding = args[1]
- if len(args) > 2:
- p.error('Too many arguments')
-
- if file_.startswith('http://') or file_.startswith('https://'):
- baseurl = file_
- j = urllib.urlopen(baseurl)
- data = j.read()
- if encoding is None:
- try:
- from feedparser import _getCharacterEncoding as enc
- except ImportError:
- enc = lambda x, y: ('utf-8', 1)
- encoding = enc(j.headers, data)[0]
- if encoding == 'us-ascii':
- encoding = 'utf-8'
- else:
- data = open(file_, 'rb').read()
- if encoding is None:
- try:
- from chardet import detect
- except ImportError:
- detect = lambda x: {'encoding': 'utf-8'}
- encoding = detect(data)['encoding']
- else:
- data = sys.stdin.read()
-
- data = data.decode(encoding)
- h = HTML2Text(baseurl=baseurl)
- # handle options
- if options.ul_style_dash: h.ul_item_mark = '-'
-
- h.body_width = options.body_width
- h.list_indent = options.list_indent
- h.ignore_emphasis = options.ignore_emphasis
- h.ignore_links = options.ignore_links
- h.ignore_images = options.ignore_images
- h.google_doc = options.google_doc
- h.hide_strikethrough = options.hide_strikethrough
-
- wrapwrite(h.handle(data))
-
-
-if __name__ == "__main__":
- main()