# theolem.org/xmlparser.py
# 2020-05-07 18:29:37 +02:00
#
# 58 lines
# 1.6 KiB
# Python

from xml.dom import minidom
from html2text import HTML2Text
from slugify import slugify
from datetime import date, datetime
# Shared html2text converter; getContent() passes each entry's content
# through h.handle().
h = HTML2Text()
# Output directory for the generated .md files. The file name is appended by
# plain string concatenation in write2file(), so the trailing "/" is required.
path = "./content/poems/"
# Values written into the "Authors" and "Category" metadata lines of every
# generated file.
author = 'theo'
category = 'poèmes'
def getTitle(poem):
    """Return the text of the first ``<title>`` element inside *poem*.

    Args:
        poem: An ``xml.dom.minidom`` element (one ``<entry>`` of the export).

    Returns:
        The title text, or ``""`` when the entry has no ``<title>`` element
        or the element is empty.
    """
    titles = poem.getElementsByTagName("title")
    # Indexing [0] on an empty result would raise IndexError; an entry with
    # no <title> at all is treated like an empty <title/>.
    if not titles:
        return ""
    titleNode = titles[0].firstChild
    if titleNode is None:
        return ""
    return titleNode.data
def getContent(poem):
    """Return the body of *poem* converted by the module-level ``h`` converter.

    Args:
        poem: An ``xml.dom.minidom`` element (one ``<entry>`` of the export).

    Returns:
        The result of ``h.handle()`` applied to the text of the first
        ``<content>`` element, or ``""`` when the entry has no ``<content>``
        element or the element is empty.
    """
    contents = poem.getElementsByTagName("content")
    # Indexing [0] on an empty result would raise IndexError; an entry with
    # no <content> at all is treated like an empty <content/>.
    if not contents:
        return ""
    contentNode = contents[0].firstChild
    if contentNode is None:
        return ""
    return h.handle(contentNode.data)
def getDate(poem):
    """Return the publication date of *poem* as a ``YYYY-MM-DD`` string.

    Args:
        poem: An ``xml.dom.minidom`` element (one ``<entry>`` of the export).

    Returns:
        The calendar-date part of the first ``<published>`` element, or
        ``None`` when the entry has no ``<published>`` element or the
        element is empty/blank.

    Raises:
        ValueError: If the text before the ``T`` is not a valid
            ``YYYY-MM-DD`` date.
    """
    published = poem.getElementsByTagName("published")
    # Indexing [0] on an empty result would raise IndexError; a missing
    # element is reported the same way as an empty one.
    if not published:
        return None
    publishedNode = published[0].firstChild
    if publishedNode is None:
        return None
    # Surrounding whitespace (e.g. from pretty-printed XML) would make
    # strptime fail, so it is removed first.
    timestamp = publishedNode.data.strip()
    if not timestamp:
        return None
    # format: 2017-09-03T05:59:00.003-07:00 -- only the part before "T" is
    # kept; the strptime/strftime round trip validates it.
    datetime_object = datetime.strptime(timestamp.split('T')[0], '%Y-%m-%d')
    return datetime_object.strftime("%Y-%m-%d")
def write2file(title, content, date_pub):
    """Write one poem to ``<path><slug>.md`` with a metadata header.

    The file starts with ``Title``, ``Authors``, ``Date``, ``Modified``,
    ``Category``, ``Tags`` and ``Slug`` lines, followed by *content*.
    An existing file with the same slug is overwritten.

    Args:
        title: Poem title; also the source of the slug / file name.
        content: Body text written after the metadata lines.
        date_pub: Publication date as a string, or ``None``. When ``None``
            the ``Date`` line is left out instead of failing on string
            concatenation half-way through the file.
    """
    slug = slugify(title)
    today_date = date.today().strftime("%Y-%m-%d")

    header = ["Title: " + title, "Authors:" + author]
    if date_pub is not None:
        header.append("Date: " + date_pub)
    header.extend([
        "Modified: " + today_date,
        "Category: " + category,
        "Tags: ",
        "Slug: " + slug,
    ])

    # Explicit UTF-8 so accented text does not depend on the platform's
    # default encoding; the with-block guarantees the file is closed.
    with open(path + slug + '.md', 'w', encoding='utf-8') as f:
        for line in header:
            print(line, file=f)
        print(content, file=f)
# XML export file to convert (read from the current working directory).
EXPORT_FILE = 'blog-05-07-2020.xml'
# Number of leading <entry> elements that are not converted.
# NOTE(review): presumably these are non-poem entries at the start of the
# export -- confirm before reusing the script with another export file.
SKIPPED_ENTRIES = 58

doc = minidom.parse(EXPORT_FILE)
poems = doc.getElementsByTagName("entry")
for i, poem in enumerate(poems):
    # Skipped entries are not parsed at all, so a malformed one among them
    # cannot abort the run.
    if i < SKIPPED_ENTRIES:
        continue
    title = getTitle(poem)
    content = getContent(poem)
    date_pub = getDate(poem)
    # getTitle()/getContent() return "" (never None) when there is nothing
    # to extract, so test for emptiness; an empty title would otherwise
    # yield an empty slug and every such entry would overwrite ".md".
    if title and content:
        write2file(title, content, date_pub)