おまけ - つちのこ、のこのこ。(beta)

ラノサイ杯読んだもの一覧 - つちのこ、のこのこ。(はてな番外地)

に貼ったリスト作るのに使った Python スクリプト。(Python 2.5 用)

使い方

http://ippo.dip.jp/lightnovel/lnsite2009first/vote/books?page=1&per_page=-1&view=simple

から取得した HTML を適当な名前でセーブして、

# tableget.py
# Convert the tables of an HTML document to TSV (tab-separated tabular data).
# UTF-8 only.
'''HTML table to TSV.'''

__version__ = '1.0.0'
__date__ = '2007-07-14'
__author__ = 'kadotanimitsuru'

from HTMLParser import HTMLParser, HTMLParseError
from htmlentitydefs import name2codepoint

class Parser(HTMLParser):
    '''Collect the cell text of every <table> in an HTML document.

    After feeding a document, ``tables`` holds one entry per table; each
    table is a list of rows and each row is a list of cell strings with
    runs of whitespace collapsed to single spaces.

    Nested tables are not supported: a <table> start tag inside another
    table, or a </table> end tag with no open table, raises
    HTMLParseError.  colspan/rowspan are ignored.
    '''

    def __init__(self, anchor=True):
        '''anchor: when true, links inside cells are kept as "[href text]".'''
        HTMLParser.__init__(self)
        self.tables = []    # list of tables -> list of rows -> list of cells
        self.nest = 0       # 1 while inside a <table>, otherwise 0
        self.sel = False    # True while inside a <th>/<td> cell
        self.data = None    # text fragments of the open cell, None if no cell
        self.anchor = anchor

    def _data_store(self):
        '''Append the open cell (if any) to the current row and close it.'''
        if self.data is not None:
            # split() without arguments already ignores leading/trailing
            # whitespace, so no separate strip() is needed.
            data = ' '.join(''.join(self.data).split())
            self.tables[-1][-1].append(data)
            self.data = None

    def handle_starttag(self, tag, attrs):
        if tag == 'table':
            if self.nest:
                raise HTMLParseError('table nested', self.getpos())
            self.nest += 1
            self.tables.append([])
        elif tag == 'tr':
            # Rows outside of any table are ignored instead of crashing
            # or being attached to a table that is already closed.
            if self.nest:
                # </td> and </tr> may legally be omitted: flush the cell
                # that is still open before starting the new row.
                self._data_store()
                self.sel = False
                self.tables[-1].append([])
        elif tag in ('th', 'td'):
            if self.nest:
                self._data_store()      # previous cell without an end tag
                if not self.tables[-1]:
                    # Cell without an enclosing <tr>: open a row for it.
                    self.tables[-1].append([])
                self.data = []
                self.sel = True
        elif self.sel:
            if self.anchor and tag == 'a':
                # A valueless attribute is reported as None; treat as ''.
                href = dict(attrs).get('href') or ''
                self.data.append(' [%s ' % (href,))
            elif tag == 'img':
                self.handle_data(dict(attrs).get('alt') or '')

    def handle_endtag(self, tag):
        if tag == 'table':
            if not self.nest:
                # HTMLParseError is the module-level exception class; it is
                # not an attribute of the HTMLParser class.
                raise HTMLParseError('unmatched </table>', self.getpos())
            self._data_store()
            self.sel = False
            self.nest -= 1
        elif tag in ('tr', 'th', 'td'):
            self._data_store()
            self.sel = False
        elif self.sel:
            if self.anchor and tag == 'a':
                self.data.append('] ')

    def handle_data(self, data):
        # Text outside of table cells is discarded.
        if self.sel:
            self.data.append(data)

    def handle_charref(self, ref):
        # Numeric character reference: "&#x41;" (hex) or "&#65;" (decimal).
        if ref[0] in ('x', 'X'):
            i = int(ref[1:], 16)
        else:
            i = int(ref)
        self.handle_data(unichr(i))

    def handle_entityref(self, name):
        # Unknown entity names are passed through literally as "&name;".
        c = name2codepoint.get(name, '&%s;' % (name,))
        if isinstance(c, int):
            c = unichr(c)
        self.handle_data(c)

    def get_tsv(self):
        '''Return one TSV string per table: cells joined by TAB, rows by LF.'''
        return ['\n'.join(['\t'.join(row) for row in table])
                for table in self.tables]


def tableget(html, anchor=True):
    '''Return a list holding one TSV string per table found in html.

    Arguments:
    html:   Source HTML text (unicode, or a plain ASCII string).
    anchor: When true, the link target of each <a> inside a cell is kept
            in the output next to the link text.
    '''
    parser = Parser(anchor)
    parser.feed(html)
    # close() makes the parser process any input it is still buffering.
    parser.close()
    tsv_list = parser.get_tsv()
    return tsv_list
    
# Help text printed when the script starts.  In English:
#   Specify an HTML file stored locally.
#   The HTML file must have been converted to UTF-8 beforehand.
#   Tables in the HTML file must not be nested.
#   colspan and rowspan are not supported either.
#   Tab-separated tables usable in spreadsheet software etc. are written
#   to files named 'table?.txt'.
usage = u'''ローカルに置いた HTML ファイルを指定してください。
  HTML ファイルはあらかじめ UTF-8 にしておく必要があります。
  HTML ファイル中の table はネストしてはいけません。
  colspan, rowspan にも対応していません。
  'table?.txt' という形式で表計算ソフト等で扱える TAB 区切りの表が出来ます。
'''

if __name__ == '__main__':
    print usage,
    filename = raw_input('HTML filename:')
    html = open(filename, 'U').read()
    try:
        html = unicode(html, 'utf')
    except UnicodeDecodeError:
        html = unicode(html)
    tables = tableget(html)
    for i, t in enumerate(tables):
        filename = 'table%d.txt' % (i,)
        print filename
        f = open(filename, 'w')
        f.write(t.encode('utf'))
        f.close()

# Public domain. 好きに流用してください

でタブ区切りテキスト(table0.txt)に変換。

そこから手動で読んだ作品の行だけ抜き出して read.txt というファイル名でセーブ。そして、

import os.path

# Minimal HTML5 page template.  It has two %s slots, filled in order with
# the page title and the markup placed inside <body>.
HTML5 = '''<!doctype html>
<html>
<head>
<meta charset="UTF-8">
<title>%s</title>
</head>
<body>
%s
</body>
</html>'''

def tsv2table(tsv):
    '''Parse TSV text into a list of rows, each a list of cell strings.

    Leading and trailing line breaks are ignored; an empty input yields
    an empty list.  Only line breaks are trimmed: a bare strip() would
    also eat TAB characters and thereby drop empty cells at the start of
    the first row or at the end of the last row.
    '''
    return [line.split('\t') for line in tsv.strip('\r\n').splitlines()]

def table2tsv(table):
    '''Serialize rows of cell strings to TSV text.

    Cells are joined with TAB and rows with LF; no trailing line break
    is added.
    '''
    lines = []
    for row in table:
        lines.append('\t'.join(row))
    return '\n'.join(lines)

def tohtml(table):
    '''Render the rows of table as an HTML ordered list (<ol>).

    Every row must have exactly ten cells: title, author, illustrator,
    label, edition, price, date, comment, link, vote.  A row of any other
    length raises ValueError from the tuple unpacking.

    Each item shows title, author and illustrator, followed by the label
    as a link.  The link target is the first word of the last
    "[url text]" group in the link cell; an empty link cell gives an
    empty href.  Text and href are HTML-escaped.
    '''
    # Function-scope import: available in the standard library and keeps
    # this block self-contained.
    from xml.sax.saxutils import escape

    items = []
    for (title, author, illustrator, label, edition, price, date,
         comment, link, vote) in table:
        # Groups look like "[url text] [url text]".  Splitting on "] ["
        # removes the opening bracket of every group except the first, so
        # strip it explicitly for the single-group case.
        words = link.split('] [')[-1].lstrip('[').split()
        if words:
            ref = words[0]
        else:
            ref = u''
        items.append(
            u'<li>%(title)s <small>(著者:%(author)s, イラスト:%(illustrator)s) <a href="%(ref)s">%(label)s</a></small></li>'
            % {
                'title': escape(title),
                'author': escape(author),
                'illustrator': escape(illustrator),
                'label': escape(label),
                # The href sits inside double quotes, so quote those too.
                'ref': escape(ref, {'"': '&quot;'}),
            })
    return u'''<ol>
%s
</ol>
''' % '\n'.join(items)
        
def tablesort(filename):
    tsv = open(filename, 'U').read()
    try:
        tsv = unicode(tsv, 'utf')
    except UnicodeDecodeError:
        tsv = unicode(tsv)
    table = tsv2table(tsv)
    table.sort(cmp=lambda x,y: cmp(
        (x[1], x[3], [int(i) for i in x[6].split('/')], x[0]),
        (y[1], y[3], [int(i) for i in y[6].split('/')], y[0])))
    tsv = table2tsv(table)
    root, ext = os.path.splitext(filename)
    tsvfile = root + '_sorted' + ext
    htmlfile = root + '.htm'
    print tsvfile, htmlfile
    f = open(tsvfile, 'w')
    f.write(tsv.encode('utf'))
    f.close()
    f = open(htmlfile, 'w')
    f.write((HTML5 % (filename, tohtml(table))).encode('utf'))
    f.close()

if __name__ == '__main__':
    # Interactive entry point: ask for the TSV file to sort and convert.
    tsv_path = raw_input('tsv filename:')
    tablesort(tsv_path)

# Public domain. 好きに流用してください

を実行。すると read_sorted.txt と read.htm という2つのファイルが出来上がります。read.htm の方がそれなので、あとはその中の ol要素を blog なりに貼りつければ OK。(先のリストはこの出力に更に手動で手を加えてシリーズをまとめて2段リストにしたり未読を斜体にしたりしています)