-
Notifications
You must be signed in to change notification settings - Fork 67
/
Copy pathevents.py
533 lines (431 loc) · 21.4 KB
/
events.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
import datetime
import logging
from legistar.events import LegistarAPIEventScraper
from pupa.scrape import Event, Scraper
from legistar.base import LegistarScraper
# Module-level logger, named after this module; the scraper classes below use
# it to report events whose other-language partner cannot be found.
LOGGER = logging.getLogger(__name__)
class UnmatchedEventError(Exception):
    '''
    Error describing one or more events whose companion event
    could not be found.

    events: either a single event mapping, a list of event mappings,
        or anything else. Each mapping must provide the keys
        'EventId', 'EventTime', 'EventDate' and 'EventInSiteURL'.
        A single mapping yields a one-line message, a list yields one
        newline-terminated line per event, and any other value yields
        a generic message.
    '''

    def __init__(self, events):
        message_format = "Can't find companion for Event {0} at {1} on {2} - {3} {4}"

        def describe(event, terminator):
            return message_format.format(event['EventId'],
                                         event['EventTime'],
                                         event['EventDate'],
                                         event['EventInSiteURL'],
                                         terminator)

        # isinstance (rather than an exact type check) so that dict
        # subclasses are described too.
        if isinstance(events, dict):
            message = describe(events, '')
        elif isinstance(events, list):
            message = ''.join(describe(event, '\n') for event in events)
        else:
            message = "Can't find companion event"

        super().__init__(message)
class LametroEventScraper(LegistarAPIEventScraper, Scraper):
    '''
    Scraper for LA Metro meetings.

    Metro publishes separate English and Spanish ('(SAP)') Legistar
    entries for the same meeting; this scraper makes sure both members
    of a pair are fetched, merges them, and yields one pupa Event per
    English event.
    '''

    BASE_URL = 'http://webapi.legistar.com/v1/metro'
    WEB_URL = 'https://metro.legistar.com/'
    EVENTSPAGE = "https://metro.legistar.com/Calendar.aspx"
    TIMEZONE = "America/Los_Angeles"

    def _pair_events(self, events):
        '''
        Split events into those whose other-language partner is also
        present and those whose partner is missing.

        events: iterable of objects exposing `key` and `partner_key`.

        Returns a tuple (paired_events, unpaired_events): a list in
        which each event is immediately followed by its partner, and a
        view of the events left without a partner.
        '''
        paired_events = []
        unpaired_events = {}

        for incoming_event in events:
            partner_event = unpaired_events.pop(incoming_event.partner_key, None)

            if partner_event is None:
                # Partner not seen (yet); park this event under its own key.
                unpaired_events[incoming_event.key] = incoming_event
            else:
                paired_events.append(incoming_event)
                paired_events.append(partner_event)

        return paired_events, unpaired_events.values()

    def _find_partner(self, event):
        '''
        Attempt to find other-language partner of an
        event. Sometimes English events won't have Spanish
        partners, but every Spanish event should have an
        English partner.

        Returns the partner wrapped as a LAMetroAPIEvent, or None when
        the search has no results. Raises ValueError when the search
        returns more than one result, and AssertionError when the
        single result is not actually the event's partner.
        '''
        results = list(self.search('/events/', 'EventId',
                                   event.partner_search_string))

        if not results:
            if event.is_spanish:
                LOGGER.critical("Could not find English event partner.")
            return None

        # Exactly one match is expected; the unpacking fails loudly otherwise.
        partner, = results
        partner = LAMetroAPIEvent(partner)

        # Explicit check instead of `assert`, so it still runs under -O.
        if not event.is_partner(partner):
            raise AssertionError(
                'Event {0} is not the partner of event {1}'.format(
                    partner.get('EventId'), event.get('EventId')))

        return partner

    def api_events(self, *args, **kwargs):
        '''
        For meetings, Metro provides an English audio recording and
        sometimes a Spanish audio translation. Due to limitations with
        the InSite system, multiple audio recordings can't be
        associated with a single InSite event. So, Metro creates two
        InSite event entries for the same actual event, one with the
        English audio and the other with the Spanish audio. The Spanish
        InSite event entry has the same name as the English event entry,
        except the name is suffixed with ' (SAP)'.

        We need to merge these companion events. In order to do that,
        we must ensure that if we scrape one member of a pair, we also
        scrape its partner.

        This method overrides the normal api_events method to ensure
        that we get both members of pairs.
        '''
        partial_scrape = kwargs.get('since_datetime', False)

        events = (LAMetroAPIEvent(event) for event
                  in super().api_events(*args, **kwargs))

        paired, unpaired = self._pair_events(events)

        yield from paired

        # Spanish broadcasting didn't start until 5/16/2018, so we
        # check the date of any unpaired events to make sure they
        # should have a pair.
        spanish_start_date = datetime.datetime(2018, 5, 15, 0, 0, 0, 0)

        for unpaired_event in unpaired:
            yield unpaired_event

            # If we are not getting every single event then it's possible
            # that one member of a pair of English and Spanish events will
            # be included in our partial scrape and the other member
            # won't be. So, we try to find the partners for unpaired
            # events.
            if not partial_scrape:
                continue

            partner_event = self._find_partner(unpaired_event)

            if partner_event is not None:
                yield partner_event
                continue

            event_date = datetime.datetime.strptime(unpaired_event['EventDate'],
                                                    '%Y-%m-%dT%H:%M:%S')

            if event_date > spanish_start_date and unpaired_event.is_spanish:
                LOGGER.critical("Could not find English event partner.")

    def _merge_events(self, events):
        '''
        Fold each Spanish (SAP) event into its English partner.

        events: iterable of (api_event, web_event) pairs.

        Every English event gains 'event_details' (source links) and
        'audio' (media links) entries, plus 'SAPEventId' and
        'SAPEventGuid' when a Spanish partner exists.

        Returns the list of (english_event, web_event) pairs. Raises
        ValueError when two Spanish events share the same key.
        '''
        english_events = []
        spanish_events = {}

        for event, web_event in events:
            web_event = LAMetroWebEvent(web_event)

            if not event.is_spanish:
                english_events.append((event, web_event))
                continue

            if event.key in spanish_events:
                # Don't allow SAP events to be overwritten in the event
                # dictionary. If this error is raised, there is more than
                # one SAP event for a meeting body on the same day, i.e.,
                # our event pairing criteria are too broad. Consider adding
                # back event time as a match constraint. See:
                # https://github.com/opencivicdata/scrapers-us-municipal/pull/284 &
                # https://github.com/opencivicdata/scrapers-us-municipal/pull/309.
                raise ValueError(
                    '{0} already exists as a key with a value of {1}'.format(
                        event.key, spanish_events[event.key]))

            spanish_events[event.key] = (event, web_event)

        for event, web_event in english_events:
            event_details = [{
                'url': web_event['Meeting Details']['url'],
                'note': 'web',
            }]
            event_audio = []

            if web_event.has_audio:
                event_audio.append(web_event['Meeting video'])

            matches = spanish_events.pop(event.partner_key, None)

            if matches:
                spanish_event, spanish_web_event = matches

                event['SAPEventId'] = spanish_event['EventId']
                event['SAPEventGuid'] = spanish_event['EventGuid']

                event_details.append({
                    'url': spanish_web_event['Meeting Details']['url'],
                    'note': 'web (sap)',
                })

                if spanish_web_event.has_audio:
                    spanish_web_event['Meeting video']['label'] = 'Audio (SAP)'
                    event_audio.append(spanish_web_event['Meeting video'])

            event['event_details'] = event_details
            event['audio'] = event_audio

        # Anything still here should have been merged with an English event.
        if spanish_events:
            LOGGER.critical(f"Found {len(spanish_events)} Spanish event(s) without partners.")

        return english_events

    @staticmethod
    def _event_status(status_name):
        '''Map a Legistar EventAgendaStatusName to an event status string.'''
        # Events can have an EventAgendaStatusName of "Final", "Final Revised",
        # and "Final 2nd Revised."
        # We classify these events as "passed."
        if status_name.startswith('Final'):
            return 'passed'
        if status_name == 'Draft':
            return 'confirmed'
        if status_name == 'Canceled':
            return 'cancelled'
        return 'tentative'

    def _add_agenda_items(self, e, event, legistar_api_url):
        '''Attach the API agenda items of `event` to the pupa event `e`.'''
        for item in self.agenda(event):
            agenda_item = e.add_agenda_item(item["EventItemTitle"])

            if item["EventItemMatterFile"]:
                agenda_item.add_bill(item["EventItemMatterFile"])

            if item["EventItemAgendaNumber"]:
                # To the notes field, add the item number as given in the agenda minutes
                agenda_number = item["EventItemAgendaNumber"]
                agenda_item['notes'].append("Agenda number, {}".format(agenda_number))
                agenda_item['extras']['agenda_number'] = agenda_number

            # The EventItemAgendaSequence provides
            # the line number of the Legistar agenda grid.
            agenda_item['extras']['item_agenda_sequence'] = item['EventItemAgendaSequence']

        # Historically, the Legistar system has duplicated the EventItemAgendaSequence,
        # resulting in data inaccuracies. The scrape should fail in such cases, until Metro
        # cleans the data.
        item_agenda_sequences = [added['extras']['item_agenda_sequence']
                                 for added in e.agenda]

        if len(item_agenda_sequences) != len(set(item_agenda_sequences)):
            error_msg = ('An agenda has duplicate agenda items on the Legistar grid: '
                         '{event_name} on {event_date} ({legistar_api_url}). '
                         'Contact Metro, and ask them to remove the duplicate EventItemAgendaSequence.')

            raise ValueError(error_msg.format(event_name=e.name,
                                              event_date=e.start_date.strftime("%B %d, %Y"),
                                              legistar_api_url=legistar_api_url))

    def _add_documents(self, e, event, web_event):
        '''Attach the agenda and whichever minutes document is available to `e`.'''
        if event['EventAgendaFile']:
            e.add_document(note='Agenda',
                           url=event['EventAgendaFile'],
                           media_type="application/pdf",
                           date=self.to_utc_timestamp(event['EventAgendaLastPublishedUTC']).date())

        # in case this event's minutes haven't been approved yet
        e.extras['approved_minutes'] = False

        if event['EventMinutesFile']:
            e.add_document(note='Minutes',
                           url=event['EventMinutesFile'],
                           media_type="application/pdf",
                           date=self.to_utc_timestamp(event['EventMinutesLastPublishedUTC']).date())
        elif web_event['Published minutes'] != 'Not\xa0available':
            e.add_document(note=web_event['Published minutes']['label'],
                           url=web_event['Published minutes']['url'],
                           media_type="application/pdf")
        else:
            approved_minutes = self.find_approved_minutes(event)
            if approved_minutes:
                e.add_document(note=approved_minutes['MatterAttachmentName'],
                               url=approved_minutes['MatterAttachmentHyperlink'],
                               media_type="application/pdf",
                               date=self.to_utc_timestamp(approved_minutes['MatterAttachmentLastModifiedUtc']).date())
                e.extras['approved_minutes'] = True

    def _add_media_links(self, e, event):
        '''Attach the resolved audio links of `event` to the pupa event `e`.'''
        for audio in event['audio']:
            try:
                redirect_url = self.head(audio['url']).headers['Location']
            except KeyError:
                # In some cases, the redirect URL does not yet
                # contain the location of the audio file. Skip
                # these events, and retry on next scrape.
                continue

            # Sometimes if there is an issue getting the Spanish
            # audio created, Metro has the Spanish Audio link
            # go to the English Audio.
            #
            # Pupa does not allow for duplicate media links,
            # so we'll ignore the second media link if it's
            # the same as the first media link.
            #
            # Because of the way that the event['audio'] is created
            # the first audio link is always English and the
            # second is always Spanish
            e.add_media_link(note=audio['label'],
                             url=redirect_url,
                             media_type='text/html',
                             on_duplicate='ignore')

    def scrape(self, window=None):
        '''
        Yield one pupa Event per merged English/Spanish Metro meeting.

        window: number of days to look back, as anything float()
            accepts. When falsy or zero, no lower bound is passed to
            the event query.
        '''
        if window and float(window) != 0:
            # Naive UTC "now", same value utcnow() would give.
            utc_now = datetime.datetime.now(datetime.timezone.utc).replace(tzinfo=None)
            n_days_ago = utc_now - datetime.timedelta(float(window))
        else:
            n_days_ago = None

        events = self.events(since_datetime=n_days_ago)

        service_councils = {
            sc['BodyId'] for sc
            in self.search('/bodies/', 'BodyId',
                           'BodyTypeId eq 70 or BodyTypeId eq 75')
        }

        for event, web_event in self._merge_events(events):
            body_name = event["EventBodyName"]

            if 'Board of Directors -' in body_name:
                # Split on the first hyphen only, so a hyphen inside the
                # meeting name cannot break the two-way unpacking.
                body_name, event_name = [part.strip()
                                         for part
                                         in body_name.split('-', 1)]
            elif event['EventBodyId'] in service_councils:
                # Don't scrape service council or service council public hearing events.
                self.info('Skipping event {0} for {1}'.format(event['EventId'],
                                                              event['EventBodyName']))
                continue
            else:
                event_name = body_name

            status = self._event_status(event['EventAgendaStatusName'])

            location = event["EventLocation"]

            if not location:
                # We expect some events to have no location. LA Metro would
                # like these displayed in the Councilmatic interface. However,
                # OCD requires a value for this field. Add a sane default.
                location = 'Not available'

            e = Event(event_name,
                      start_date=event["start"],
                      description='',
                      location_name=location,
                      status=status)

            e.pupa_id = str(event['EventId'])

            # Metro requires the EventGuid to build out MediaPlayer links.
            # Add both the English event GUID, and the Spanish event GUID if
            # it exists, to the extras dict.
            e.extras = {'guid': event['EventGuid']}

            legistar_api_url = self.BASE_URL + '/events/{0}'.format(event['EventId'])
            e.add_source(legistar_api_url, note='api')

            if event.get('SAPEventGuid'):
                e.extras['sap_guid'] = event['SAPEventGuid']

            if web_event.has_ecomment:
                self.info('Adding eComment link {0} from {1}'.format(web_event['eComment'],
                                                                     web_event['Meeting Details']['url']))
                e.extras['ecomment'] = web_event['eComment']

            if 'event_details' in event:
                # if there is not a meeting detail page on legistar
                # don't capture the agenda data from the API
                self._add_agenda_items(e, event, legistar_api_url)

            e.add_participant(name=body_name,
                              type="organization")

            if event.get('SAPEventId'):
                e.add_source(self.BASE_URL + '/events/{0}'.format(event['SAPEventId']),
                             note='api (sap)')

            self._add_documents(e, event, web_event)
            self._add_media_links(e, event)

            if event['event_details']:
                for link in event['event_details']:
                    e.add_source(**link)
            else:
                e.add_source(self.EVENTSPAGE, note='web')

            yield e

    def _suppress_item_matter(self, item, agenda_url):
        '''
        Agenda items in Legistar do not always display links to
        associated matter files even if the same agenda item
        in the API references a Matter File. The agenda items
        we scrape should honor the suppression on the Legistar
        agendas.

        This is also practical because matter files that are hidden
        in the Legistar Agenda do not seem to be available for scraping
        on Legistar or through the API.

        Mutates `item` in place, clearing 'EventItemMatterFile' for
        draft matters and closed-session matters. `agenda_url` is
        accepted but not used.
        '''
        if item['EventItemMatterFile'] is None:
            return

        if (item['EventItemMatterStatus'] == 'Draft'
                or item['EventItemMatterType'] == 'Closed Session'):
            item['EventItemMatterFile'] = None

    def find_approved_minutes(self, event):
        '''
        The minutes of some meetings are available as a legislative item
        that are approved at the subsequent meeting. This method tries
        to find them.

        This method is pretty complicated, but if we can get it right
        here, it avoids many complicated and expensive queries in the
        councilmatic app.

        Returns the attachment dict for the minutes, or None when the
        body is not one we look minutes up for, the event is in the
        future, or no unambiguous minutes matter/attachment is found.
        Raises ValueError when the minutes matter has no attachments.
        '''
        name = event['EventBodyName']

        if name not in {'Board of Directors - Regular Board Meeting',
                        'LA SAFE'}:
            return None

        # if the event is in the future, there won't have been a chance to
        # approve the minutes
        if event['start'] > datetime.datetime.now(datetime.timezone.utc):
            return None

        # e.g. "May 28, 2015"; the day is unpadded. Built with str.format
        # because the '%-d' strftime directive is not portable.
        date = '{0:%B} {0.day}, {0:%Y}'.format(event['start'])

        matters = list(self.search(
            '/matters/',
            'MatterId',
            "MatterBodyId eq {} and substringof('{}', MatterTitle) and substringof('Minutes', MatterTitle)".format(event['EventBodyId'], date)))

        if not matters:
            self.warning(
                "Couldn't find minutes for the {} meeting of {}."
                .format(name, date))
            return None

        if len(matters) > 1:
            self.warning(
                "Found more than one minutes file for the {} meeting of {}."
                .format(name, date))
            return None

        matter, = matters

        attachment_url = self.BASE_URL + '/matters/{}/attachments'.format(matter['MatterId'])
        attachments = self.get(attachment_url).json()

        if not attachments:
            raise ValueError('No attachments for the approved minutes matter')

        if len(attachments) == 1:
            return attachments[0]

        # This dictionary contains a mapping of dates of events known to
        # have more than one minutes file attached to the approval matter,
        # to the name of the attachment representing the correct minutes
        # file.
        handled_cases = {
            'May 28, 2015': 'Regular Board Meeting Minutes on May 28, 2015',
            'September 24, 2020': 'LA SAFE Minutes - September 24, 2020',
            'June 24, 2021': 'LA SAFE MINUTES - June 24, 2021',
            'December 2, 2021': 'Regular Board Meeting MINUTES - December 2, 2021',
            'January 27, 2022': 'Regular Board Meeting MINUTES - January 27, 2022',
            'February 24, 2022': 'MINUTES - February 24, 2022 RBM',
            'June 23, 2022': 'Regular Board Meeting MINUTES - June 23, 2022',
        }

        if date in handled_cases:
            attachment_name = handled_cases[date]
            # Exactly one attachment must carry the known name.
            attachment, = [
                each for each in attachments
                if each['MatterAttachmentName'] == attachment_name
            ]
            return attachment

        # Otherwise accept the single attachment whose name mentions minutes.
        candidates = [
            each for each in attachments
            if 'minutes' in each['MatterAttachmentName'].lower()
        ]

        if len(candidates) != 1:
            LOGGER.critical(
                "More than one attachment for the approved minutes matter"
            )
            return None

        attachment, = candidates
        msg = 'More than one attachment for minutes matter {0}, using {1}'.format(
            matter['MatterId'], attachment['MatterAttachmentName']
        )
        self.info(msg)
        return attachment
class LAMetroAPIEvent(dict):
    '''
    This class is for adding methods to the API event dict
    to facilitate matching events with their other-language
    partners.

    The dict is expected to carry 'EventBodyName' and 'EventDate'.
    Spanish events are recognised by an ' (SAP)' suffix on the
    body name.
    '''

    @property
    def is_spanish(self):
        '''True when the body name ends with the ' (SAP)' suffix.'''
        return self['EventBodyName'].endswith(' (SAP)')

    @property
    def _partner_name(self):
        '''Body name of the other-language partner of this event.'''
        suffix = ' (SAP)'
        if self.is_spanish:
            # Slice the suffix off. str.rstrip(' (SAP)') would strip any
            # trailing run of those *characters*, eating into names that
            # end in 'S', 'A', 'P', parentheses or spaces.
            return self['EventBodyName'][:-len(suffix)]
        else:
            return self['EventBodyName'] + suffix

    def is_partner(self, other):
        '''True when `other` has the partner's body name and the same date.'''
        return (self._partner_name == other['EventBodyName'] and
                self['EventDate'] == other['EventDate'])

    @property
    def partner_search_string(self):
        '''Filter expression selecting this event's partner by name and date.'''
        search_string = "EventBodyName eq '{}'".format(self._partner_name)
        search_string += " and EventDate eq datetime'{}'".format(self['EventDate'])

        return search_string

    @property
    def partner_key(self):
        '''The `key` that this event's partner would have.'''
        return (self._partner_name, self['EventDate'])

    @property
    def key(self):
        '''(body name, date) tuple identifying this event for pairing.'''
        return (self['EventBodyName'], self['EventDate'])
class LAMetroWebEvent(dict):
    '''
    Dict wrapper around a scraped web event that adds helpers for
    labeling and sourcing audio (and eComment links) appropriately.

    A column holds the string 'Not\\xa0available' (with a non-breaking
    space) when the web page offers nothing for it.
    '''

    web_scraper = LegistarScraper(retry_attempts=3,
                                  requests_per_minute=0)

    @property
    def has_audio(self):
        '''True unless the 'Meeting video' column is marked unavailable.'''
        meeting_video = self['Meeting video']
        return meeting_video != 'Not\xa0available'

    @property
    def has_ecomment(self):
        '''True unless the 'eComment' column is marked unavailable.'''
        ecomment = self['eComment']
        return ecomment != 'Not\xa0available'