# -*- coding: utf-8 -*- from __future__ import print_function, unicode_literals import logging import os import re from collections import OrderedDict import docutils import docutils.core import docutils.io from docutils.parsers.rst.languages import get_language as get_docutils_lang from docutils.writers.html4css1 import HTMLTranslator, Writer import six from six import StringIO from six.moves.html_parser import HTMLParser from pelican import rstdirectives # NOQA from pelican import signals from pelican.cache import FileStampDataCacher from pelican.contents import Author, Category, Page, Tag from pelican.utils import SafeDatetime, escape_html, get_date, pelican_open, \ posixize_path try: from markdown import Markdown except ImportError: Markdown = False # NOQA # Metadata processors have no way to discard an unwanted value, so we have # them return this value instead to signal that it should be discarded later. # This means that _filter_discardable_metadata() must be called on processed # metadata dicts before use, to remove the items with the special value. _DISCARD = object() DUPLICATES_DEFINITIONS_ALLOWED = { 'tags': False, 'date': False, 'modified': False, 'status': False, 'category': False, 'author': False, 'save_as': False, 'url': False, 'authors': False, 'slug': False } METADATA_PROCESSORS = { 'tags': lambda x, y: ([ Tag(tag, y) for tag in ensure_metadata_list(x) ] or _DISCARD), 'date': lambda x, y: get_date(x.replace('_', ' ')), 'modified': lambda x, y: get_date(x), 'status': lambda x, y: x.strip() or _DISCARD, 'category': lambda x, y: _process_if_nonempty(Category, x, y), 'author': lambda x, y: _process_if_nonempty(Author, x, y), 'authors': lambda x, y: ([ Author(author, y) for author in ensure_metadata_list(x) ] or _DISCARD), 'slug': lambda x, y: x.strip() or _DISCARD, } logger = logging.getLogger(__name__) def ensure_metadata_list(text): """Canonicalize the format of a list of authors or tags. This works the same way as Docutils' "authors" field: if it's already a list, those boundaries are preserved; otherwise, it must be a string; if the string contains semicolons, it is split on semicolons; otherwise, it is split on commas. This allows you to write author lists in either "Jane Doe, John Doe" or "Doe, Jane; Doe, John" format. Regardless, all list items undergo .strip() before returning, and empty items are discarded. """ if isinstance(text, six.text_type): if ';' in text: text = text.split(';') else: text = text.split(',') return list(OrderedDict.fromkeys( [v for v in (w.strip() for w in text) if v] )) def _process_if_nonempty(processor, name, settings): """Removes extra whitespace from name and applies a metadata processor. If name is empty or all whitespace, returns _DISCARD instead. """ name = name.strip() return processor(name, settings) if name else _DISCARD def _filter_discardable_metadata(metadata): """Return a copy of a dict, minus any items marked as discardable.""" return {name: val for name, val in metadata.items() if val is not _DISCARD} class BaseReader(object): """Base class to read files. This class is used to process static files, and it can be inherited for other types of file. A Reader class must have the following attributes: - enabled: (boolean) tell if the Reader class is enabled. It generally depends on the import of some dependency. - file_extensions: a list of file extensions that the Reader will process. - extensions: a list of extensions to use in the reader (typical use is Markdown). """ enabled = True file_extensions = ['static'] extensions = None def __init__(self, settings): self.settings = settings def process_metadata(self, name, value): if name in METADATA_PROCESSORS: return METADATA_PROCESSORS[name](value, self.settings) return value def read(self, source_path): "No-op parser" content = None metadata = {} return content, metadata class _FieldBodyTranslator(HTMLTranslator): def __init__(self, document): HTMLTranslator.__init__(self, document) self.compact_p = None def astext(self): return ''.join(self.body) def visit_field_body(self, node): pass def depart_field_body(self, node): pass def render_node_to_html(document, node, field_body_translator_class): visitor = field_body_translator_class(document) node.walkabout(visitor) return visitor.astext() class PelicanHTMLWriter(Writer): def __init__(self): Writer.__init__(self) self.translator_class = PelicanHTMLTranslator class PelicanHTMLTranslator(HTMLTranslator): def visit_abbreviation(self, node): attrs = {} if node.hasattr('explanation'): attrs['title'] = node['explanation'] self.body.append(self.starttag(node, 'abbr', '', **attrs)) def depart_abbreviation(self, node): self.body.append('') def visit_image(self, node): # set an empty alt if alt is not specified # avoids that alt is taken from src node['alt'] = node.get('alt', '') return HTMLTranslator.visit_image(self, node) class RstReader(BaseReader): """Reader for reStructuredText files By default the output HTML is written using docutils.writers.html4css1.Writer and translated using a subclass of docutils.writers.html4css1.HTMLTranslator. If you want to override it with your own writer/translator (e.g. a HTML5-based one), pass your classes to these two attributes. Look in the source code for details. writer_class Used for writing contents field_body_translator_class Used for translating metadata such as article summary """ enabled = bool(docutils) file_extensions = ['rst'] writer_class = PelicanHTMLWriter field_body_translator_class = _FieldBodyTranslator class FileInput(docutils.io.FileInput): """Patch docutils.io.FileInput to remove "U" mode in py3. Universal newlines is enabled by default and "U" mode is deprecated in py3. """ def __init__(self, *args, **kwargs): if six.PY3: kwargs['mode'] = kwargs.get('mode', 'r').replace('U', '') docutils.io.FileInput.__init__(self, *args, **kwargs) def __init__(self, *args, **kwargs): super(RstReader, self).__init__(*args, **kwargs) lang_code = self.settings.get('DEFAULT_LANG', 'en') if get_docutils_lang(lang_code): self._language_code = lang_code else: logger.warning("Docutils has no localization for '%s'." " Using 'en' instead.", lang_code) self._language_code = 'en' def _parse_metadata(self, document, source_path): """Return the dict containing document metadata""" formatted_fields = self.settings['FORMATTED_FIELDS'] output = {} if document.first_child_matching_class(docutils.nodes.title) is None: logger.warning( 'Document title missing in file %s: ' 'Ensure exactly one top level section', source_path) for docinfo in document.traverse(docutils.nodes.docinfo): for element in docinfo.children: if element.tagname == 'field': # custom fields (e.g. summary) name_elem, body_elem = element.children name = name_elem.astext() if name in formatted_fields: value = render_node_to_html( document, body_elem, self.field_body_translator_class) else: value = body_elem.astext() elif element.tagname == 'authors': # author list name = element.tagname value = [element.astext() for element in element.children] else: # standard fields (e.g. address) name = element.tagname value = element.astext() name = name.lower() output[name] = self.process_metadata(name, value) return output def _get_publisher(self, source_path): extra_params = {'initial_header_level': '2', 'syntax_highlight': 'short', 'input_encoding': 'utf-8', 'language_code': self._language_code, 'halt_level': 2, 'traceback': True, 'warning_stream': StringIO(), 'embed_stylesheet': False} user_params = self.settings.get('DOCUTILS_SETTINGS') if user_params: extra_params.update(user_params) pub = docutils.core.Publisher( writer=self.writer_class(), source_class=self.FileInput, destination_class=docutils.io.StringOutput) pub.set_components('standalone', 'restructuredtext', 'html') pub.process_programmatic_settings(None, extra_params, None) pub.set_source(source_path=source_path) pub.publish() return pub def read(self, source_path): """Parses restructured text""" pub = self._get_publisher(source_path) parts = pub.writer.parts content = parts.get('body') metadata = self._parse_metadata(pub.document, source_path) metadata.setdefault('title', parts.get('title')) return content, metadata class MarkdownReader(BaseReader): """Reader for Markdown files""" enabled = bool(Markdown) file_extensions = ['md', 'markdown', 'mkd', 'mdown'] def __init__(self, *args, **kwargs): super(MarkdownReader, self).__init__(*args, **kwargs) settings = self.settings['MARKDOWN'] settings.setdefault('extension_configs', {}) settings.setdefault('extensions', []) for extension in settings['extension_configs'].keys(): if extension not in settings['extensions']: settings['extensions'].append(extension) if 'markdown.extensions.meta' not in settings['extensions']: settings['extensions'].append('markdown.extensions.meta') self._source_path = None def _parse_metadata(self, meta): """Return the dict containing document metadata""" formatted_fields = self.settings['FORMATTED_FIELDS'] output = {} for name, value in meta.items(): name = name.lower() if name in formatted_fields: # formatted metadata is special case and join all list values formatted_values = "\n".join(value) # reset the markdown instance to clear any state self._md.reset() formatted = self._md.convert(formatted_values) output[name] = self.process_metadata(name, formatted) elif not DUPLICATES_DEFINITIONS_ALLOWED.get(name, True): if len(value) > 1: logger.warning( 'Duplicate definition of `%s` ' 'for %s. Using first one.', name, self._source_path) output[name] = self.process_metadata(name, value[0]) elif len(value) > 1: # handle list metadata as list of string output[name] = self.process_metadata(name, value) else: # otherwise, handle metadata as single string output[name] = self.process_metadata(name, value[0]) return output def read(self, source_path): """Parse content and metadata of markdown files""" self._source_path = source_path self._md = Markdown(**self.settings['MARKDOWN']) with pelican_open(source_path) as text: content = self._md.convert(text) if hasattr(self._md, 'Meta'): metadata = self._parse_metadata(self._md.Meta) else: metadata = {} return content, metadata class HTMLReader(BaseReader): """Parses HTML files as input, looking for meta, title, and body tags""" file_extensions = ['htm', 'html'] enabled = True class _HTMLParser(HTMLParser): def __init__(self, settings, filename): try: # Python 3.5+ HTMLParser.__init__(self, convert_charrefs=False) except TypeError: HTMLParser.__init__(self) self.body = '' self.metadata = {} self.settings = settings self._data_buffer = '' self._filename = filename self._in_top_level = True self._in_head = False self._in_title = False self._in_body = False self._in_tags = False def handle_starttag(self, tag, attrs): if tag == 'head' and self._in_top_level: self._in_top_level = False self._in_head = True elif tag == 'title' and self._in_head: self._in_title = True self._data_buffer = '' elif tag == 'body' and self._in_top_level: self._in_top_level = False self._in_body = True self._data_buffer = '' elif tag == 'meta' and self._in_head: self._handle_meta_tag(attrs) elif self._in_body: self._data_buffer += self.build_tag(tag, attrs, False) def handle_endtag(self, tag): if tag == 'head': if self._in_head: self._in_head = False self._in_top_level = True elif self._in_head and tag == 'title': self._in_title = False self.metadata['title'] = self._data_buffer elif tag == 'body': self.body = self._data_buffer self._in_body = False self._in_top_level = True elif self._in_body: self._data_buffer += ''.format(escape_html(tag)) def handle_startendtag(self, tag, attrs): if tag == 'meta' and self._in_head: self._handle_meta_tag(attrs) if self._in_body: self._data_buffer += self.build_tag(tag, attrs, True) def handle_comment(self, data): self._data_buffer += ''.format(data) def handle_data(self, data): self._data_buffer += data def handle_entityref(self, data): self._data_buffer += '&{};'.format(data) def handle_charref(self, data): self._data_buffer += '&#{};'.format(data) def build_tag(self, tag, attrs, close_tag): result = '<{}'.format(escape_html(tag)) for k, v in attrs: result += ' ' + escape_html(k) if v is not None: # If the attribute value contains a double quote, surround # with single quotes, otherwise use double quotes. if '"' in v: result += "='{}'".format(escape_html(v, quote=False)) else: result += '="{}"'.format(escape_html(v, quote=False)) if close_tag: return result + ' />' return result + '>' def _handle_meta_tag(self, attrs): name = self._attr_value(attrs, 'name') if name is None: attr_list = ['{}="{}"'.format(k, v) for k, v in attrs] attr_serialized = ', '.join(attr_list) logger.warning("Meta tag in file %s does not have a 'name' " "attribute, skipping. Attributes: %s", self._filename, attr_serialized) return name = name.lower() contents = self._attr_value(attrs, 'content', '') if not contents: contents = self._attr_value(attrs, 'contents', '') if contents: logger.warning( "Meta tag attribute 'contents' used in file %s, should" " be changed to 'content'", self._filename, extra={'limit_msg': "Other files have meta tag " "attribute 'contents' that should " "be changed to 'content'"}) if name == 'keywords': name = 'tags' if name in self.metadata: # if this metadata already exists (i.e. a previous tag with the # same name has already been specified then either convert to # list or append to list if isinstance(self.metadata[name], list): self.metadata[name].append(contents) else: self.metadata[name] = [self.metadata[name], contents] else: self.metadata[name] = contents @classmethod def _attr_value(cls, attrs, name, default=None): return next((x[1] for x in attrs if x[0] == name), default) def read(self, filename): """Parse content and metadata of HTML files""" with pelican_open(filename) as content: parser = self._HTMLParser(self.settings, filename) parser.feed(content) parser.close() metadata = {} for k in parser.metadata: metadata[k] = self.process_metadata(k, parser.metadata[k]) return parser.body, metadata class Readers(FileStampDataCacher): """Interface for all readers. This class contains a mapping of file extensions / Reader classes, to know which Reader class must be used to read a file (based on its extension). This is customizable both with the 'READERS' setting, and with the 'readers_init' signall for plugins. """ def __init__(self, settings=None, cache_name=''): self.settings = settings or {} self.readers = {} self.reader_classes = {} for cls in [BaseReader] + BaseReader.__subclasses__(): if not cls.enabled: logger.debug('Missing dependencies for %s', ', '.join(cls.file_extensions)) continue for ext in cls.file_extensions: self.reader_classes[ext] = cls if self.settings['READERS']: self.reader_classes.update(self.settings['READERS']) signals.readers_init.send(self) for fmt, reader_class in self.reader_classes.items(): if not reader_class: continue self.readers[fmt] = reader_class(self.settings) # set up caching cache_this_level = (cache_name != '' and self.settings['CONTENT_CACHING_LAYER'] == 'reader') caching_policy = cache_this_level and self.settings['CACHE_CONTENT'] load_policy = cache_this_level and self.settings['LOAD_CONTENT_CACHE'] super(Readers, self).__init__(settings, cache_name, caching_policy, load_policy, ) @property def extensions(self): return self.readers.keys() def read_file(self, base_path, path, content_class=Page, fmt=None, context=None, preread_signal=None, preread_sender=None, context_signal=None, context_sender=None): """Return a content object parsed with the given format.""" path = os.path.abspath(os.path.join(base_path, path)) source_path = posixize_path(os.path.relpath(path, base_path)) logger.debug( 'Read file %s -> %s', source_path, content_class.__name__) if not fmt: _, ext = os.path.splitext(os.path.basename(path)) fmt = ext[1:] if fmt not in self.readers: raise TypeError( 'Pelican does not know how to parse %s', path) if preread_signal: logger.debug( 'Signal %s.send(%s)', preread_signal.name, preread_sender) preread_signal.send(preread_sender) reader = self.readers[fmt] metadata = _filter_discardable_metadata(default_metadata( settings=self.settings, process=reader.process_metadata)) metadata.update(path_metadata( full_path=path, source_path=source_path, settings=self.settings)) metadata.update(_filter_discardable_metadata(parse_path_metadata( source_path=source_path, settings=self.settings, process=reader.process_metadata))) reader_name = reader.__class__.__name__ metadata['reader'] = reader_name.replace('Reader', '').lower() content, reader_metadata = self.get_cached_data(path, (None, None)) if content is None: content, reader_metadata = reader.read(path) self.cache_data(path, (content, reader_metadata)) metadata.update(_filter_discardable_metadata(reader_metadata)) if content: # find images with empty alt find_empty_alt(content, path) # eventually filter the content with typogrify if asked so if self.settings['TYPOGRIFY']: from typogrify.filters import typogrify import smartypants # Tell `smartypants` to also replace " HTML entities with # smart quotes. This is necessary because Docutils has already # replaced double quotes with said entities by the time we run # this filter. smartypants.Attr.default |= smartypants.Attr.w def typogrify_wrapper(text): """Ensures ignore_tags feature is backward compatible""" try: return typogrify( text, self.settings['TYPOGRIFY_IGNORE_TAGS']) except TypeError: return typogrify(text) if content: content = typogrify_wrapper(content) if 'title' in metadata: metadata['title'] = typogrify_wrapper(metadata['title']) if 'summary' in metadata: metadata['summary'] = typogrify_wrapper(metadata['summary']) if context_signal: logger.debug( 'Signal %s.send(%s, )', context_signal.name, context_sender) context_signal.send(context_sender, metadata=metadata) return content_class(content=content, metadata=metadata, settings=self.settings, source_path=path, context=context) def find_empty_alt(content, path): """Find images with empty alt Create warnings for all images with empty alt (up to a certain number), as they are really likely to be accessibility flaws. """ imgs = re.compile(r""" (?: # src before alt ]* src=(['"])(.*?)\1 [^\>]* alt=(['"])\3 )|(?: # alt before src ]* alt=(['"])\4 [^\>]* src=(['"])(.*?)\5 ) """, re.X) for match in re.findall(imgs, content): logger.warning( 'Empty alt attribute for image %s in %s', os.path.basename(match[1] + match[5]), path, extra={'limit_msg': 'Other images have empty alt attributes'}) def default_metadata(settings=None, process=None): metadata = {} if settings: for name, value in dict(settings.get('DEFAULT_METADATA', {})).items(): if process: value = process(name, value) metadata[name] = value if 'DEFAULT_CATEGORY' in settings: value = settings['DEFAULT_CATEGORY'] if process: value = process('category', value) metadata['category'] = value if settings.get('DEFAULT_DATE', None) and \ settings['DEFAULT_DATE'] != 'fs': if isinstance(settings['DEFAULT_DATE'], six.string_types): metadata['date'] = get_date(settings['DEFAULT_DATE']) else: metadata['date'] = SafeDatetime(*settings['DEFAULT_DATE']) return metadata def path_metadata(full_path, source_path, settings=None): metadata = {} if settings: if settings.get('DEFAULT_DATE', None) == 'fs': metadata['date'] = SafeDatetime.fromtimestamp( os.stat(full_path).st_mtime) # Apply EXTRA_PATH_METADATA for the source path and the paths of any # parent directories. Sorting EPM first ensures that the most specific # path wins conflicts. epm = settings.get('EXTRA_PATH_METADATA', {}) for path, meta in sorted(epm.items()): # Enforce a trailing slash when checking for parent directories. # This prevents false positives when one file or directory's name # is a prefix of another's. dirpath = os.path.join(path, '') if source_path == path or source_path.startswith(dirpath): metadata.update(meta) return metadata def parse_path_metadata(source_path, settings=None, process=None): r"""Extract a metadata dictionary from a file's path >>> import pprint >>> settings = { ... 'FILENAME_METADATA': r'(?P[^.]*).*', ... 'PATH_METADATA': ... r'(?P[^/]*)/(?P\d{4}-\d{2}-\d{2})/.*', ... } >>> reader = BaseReader(settings=settings) >>> metadata = parse_path_metadata( ... source_path='my-cat/2013-01-01/my-slug.html', ... settings=settings, ... process=reader.process_metadata) >>> pprint.pprint(metadata) # doctest: +ELLIPSIS {'category': , 'date': SafeDatetime(2013, 1, 1, 0, 0), 'slug': 'my-slug'} """ metadata = {} dirname, basename = os.path.split(source_path) base, ext = os.path.splitext(basename) subdir = os.path.basename(dirname) if settings: checks = [] for key, data in [('FILENAME_METADATA', base), ('PATH_METADATA', source_path)]: checks.append((settings.get(key, None), data)) if settings.get('USE_FOLDER_AS_CATEGORY', None): checks.append(('(?P.*)', subdir)) for regexp, data in checks: if regexp and data: match = re.match(regexp, data) if match: # .items() for py3k compat. for k, v in match.groupdict().items(): k = k.lower() # metadata must be lowercase if v is not None and k not in metadata: if process: v = process(k, v) metadata[k] = v return metadata