#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import html.parser
import json
import re
import sys
from datetime import datetime, timedelta
from urllib.request import urlopen
### Config ###
# URL of the Studentenwerk S-H page carrying the weekly Mensa Lübeck plan.
mensa_url = 'http://www.uni-kiel.de/stwsh/seiten_essen/plan_mensa_luebeck.html'
# The page is served as Latin-1, not UTF-8.
encoding = 'ISO-8859-1'
# Labels for the meal rows, in the order the row pairs appear in the plan table.
meal_types = ['Eintopf', 'Hauptgericht', 'Vegetarisches Hauptgericht', 'Beilagen']
### Config end ###
class Mensa:
    """Parse the weekly Mensa Lübeck meal plan out of the Studentenwerk HTML page.

    The page is scraped with regular expressions: a table whose header cell
    carries the week's date range ("DD.MM. - DD.MM.YYYY"), followed by pairs
    of rows (meal names, then prices), one pair per meal type, with five
    columns (Monday..Friday) each.
    """

    def __init__(self):
        # Whole-page match: captures day, month, year of the week's first day
        # plus everything between the weekday header row and </table>.
        self._re_data_match = re.compile(
            r'^.*?<table[^>]*>.*?<td[^>]*>.*?(\d+)\.(\d+)\. ?- ?\d+\.\d+\.(\d+).*?</td>'
            r'.*?<td[^>]*>Freitag</td>\s*</tr>(.*?)</table>.*$',
            re.DOTALL | re.IGNORECASE)
        # Two consecutive <tr> rows: meal-name row, then price row.
        self._re_day_match = re.compile(
            r'.*?<tr[^>]*>(.*?)</tr>.*?<tr[^>]*>(.*?)</tr>',
            re.DOTALL | re.IGNORECASE)
        # A single <td> cell; used repeatedly with an advancing position.
        self._re_meal_match = re.compile(
            r'.*?<td[^>]*>(.*?)</td>', re.DOTALL | re.IGNORECASE)

    def _unescape_strip(self, s):
        """Turn one HTML table cell into clean plain text.

        Decodes entities, collapses whitespace, joins words the page
        hyphenated across a line break, keeps the "Bio" logo as text, and
        strips all remaining markup.
        """
        # html.unescape replaces HTMLParser().unescape, which was removed
        # in Python 3.9; `import html.parser` already binds `html`.
        s = html.unescape(s)
        s = s.replace('<br />', ' ')
        # "Gemüse-\neintopf" -> "Gemüse-eintopf" (drop break after hyphen) ...
        s = re.sub(r'-\s+', '-', s)
        s = re.sub(r'\s+', ' ', s)
        # ... then drop the soft hyphen between two lowercase letters entirely.
        s = re.sub(r'([a-zäöüß])-([a-zäöüß])', r'\1\2', s)
        # Keep the organic-food logo as the word "Bio" before tags are stripped.
        s = re.sub(r'<img[^>]*logo_bio[^>]*>', 'Bio', s)
        s = re.sub(r'<[^>]*>', '', s)
        return s.strip()

    def _handle_data_row(self, data, pos):
        """Parse one meal-type entry (a name row plus a price row) at *pos*.

        Returns ([(name, price)] * 5 weekdays, end position), or None when
        the expected two-row structure is not found.
        """
        match = self._re_day_match.match(data, pos)
        if not match:
            return None
        days = []
        mealpos = 0
        pricepos = 0
        for _ in range(5):  # Monday..Friday
            mealmatch = self._re_meal_match.match(match.group(1), mealpos)
            pricematch = self._re_meal_match.match(match.group(2), pricepos)
            if not mealmatch or not pricematch:
                return None
            days.append((self._unescape_strip(mealmatch.group(1)),
                         self._unescape_strip(pricematch.group(1))))
            mealpos = mealmatch.end()
            pricepos = pricematch.end()
        return (days, match.end())

    def handle_data(self, data):
        """Parse the full plan page.

        Returns a list of five day dicts
        ``{'date': 'YYYY-MM-DD', 'meals': {type: {'name': ..., 'price': ...}}}``,
        or None when the page does not match the expected layout.
        """
        match = self._re_data_match.match(data)
        if not match:
            return None
        # Monday's date, reassembled from the "DD.MM. - DD.MM.YYYY" range.
        firstday = datetime.strptime(
            '%s-%s-%s' % (match.group(3), match.group(2), match.group(1)),
            '%Y-%m-%d').date()
        data = match.group(4)
        meals = []
        pos = 0
        for _ in meal_types:
            row = self._handle_data_row(data, pos)
            if not row:
                break
            meals.append(row[0])
            pos = row[1]
        if not meals:
            return None
        # Pad missing meal-type rows with empty (name, price) tuples so the
        # per-day loop below can index every row uniformly.
        while len(meals) < len(meal_types):
            meals.append([(None, None)] * 5)
        ret = []
        for weekday in range(5):
            mealdict = {}
            for meal_type, rows in zip(meal_types, meals):
                name, price = rows[weekday]
                if name:
                    mealdict[meal_type] = {'name': name, 'price': price}
            # Every weekday is emitted, even if no meals were parsed for it
            # (the original code's append condition was always true).
            ret.append({
                'date': (firstday + timedelta(weekday)).strftime('%Y-%m-%d'),
                'meals': mealdict,
            })
        return ret
if __name__ == '__main__':
    # Fetch and decode the plan page.  URLError/HTTPError are OSError
    # subclasses; a decode failure raises ValueError (UnicodeDecodeError).
    # The previous bare `except:` also swallowed KeyboardInterrupt/SystemExit.
    try:
        with urlopen(mensa_url, None, 20) as response:
            data = response.read().decode(encoding)
    except (OSError, ValueError):
        data = None
    if not data:
        print("Could not read Mensa data", file=sys.stderr)
        sys.exit(1)
    meals = Mensa().handle_data(data)
    if not meals:
        print("No or invalid Mensa data was returned", file=sys.stderr)
        sys.exit(1)
    # ensure_ascii=False keeps umlauts readable in the JSON output.
    json.dump(meals, sys.stdout, ensure_ascii=False)