This repository has been archived by the owner on Apr 24, 2023. It is now read-only.
-
Notifications
You must be signed in to change notification settings - Fork 63
/
util.py
1047 lines (829 loc) · 37.3 KB
/
util.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
import functools
import importlib
import itertools
import json
import logging
import os
import os.path
import subprocess
import time
import uuid
from datetime import datetime
from urllib.parse import urlencode
import numpy
import requests
from retrying import retry
from tests.cook import mesos
# Module-level logger shared by all helpers in this file
logger = logging.getLogger(__name__)
# Shared HTTP session used for every request made by these helpers; the module
# providing the Session class can be swapped via COOK_SESSION_MODULE
# (defaults to `requests`)
session = importlib.import_module(os.getenv('COOK_SESSION_MODULE', 'requests')).Session()
# Prefix the session's existing User-Agent with an integration-test marker
session.headers['User-Agent'] = f"Cook-Scheduler-Integration-Tests ({session.headers['User-Agent']})"
# default time limit for each individual integration test
# if a test takes more than 10 minutes, it's probably broken
DEFAULT_TEST_TIMEOUT_SECS = 600
# default time limit used by most wait_* utility functions
# 2 minutes should be more than sufficient in most cases
DEFAULT_TIMEOUT_MS = 120000
# Name of our custom HTTP header for user impersonation
IMPERSONATION_HEADER = 'X-Cook-Impersonate'
def continuous_integration():
    """
    Report whether the tests run under continuous integration.
    The raw value of the CONTINUOUS_INTEGRATION environment variable (as set by Travis-CI)
    is returned: a string when the variable is set, None when it is absent.
    """
    return os.environ.get('CONTINUOUS_INTEGRATION')
def has_docker_service():
    """Check whether the docker socket file exists, i.e., docker appears usable from the testing environment."""
    docker_socket_path = '/var/run/docker.sock'
    return os.path.exists(docker_socket_path)
# Fallback user names for multi-user test runs. Each one is only used when the
# corresponding environment variable is unset: USER (default user),
# COOK_ADMIN_USER_NAME (admin) and COOK_IMPERSONATOR_USER_NAME (impersonator).
_default_user_name = 'root'
_default_admin_name = 'root'
_default_impersonator_name = 'poser'
def _get_default_user_name():
    """Return the default test user's name: $USER when set, otherwise the built-in fallback name."""
    return os.environ.get('USER', _default_user_name)
@functools.lru_cache()
def _test_user_ids():
    """
    Compute the range of numeric user suffixes available to this test worker.
    Without both PYTEST_XDIST_WORKER and a non-zero COOK_MAX_TEST_USERS,
    the full range of 0 to 1 million is returned.
    In a distributed run with a per-worker user limit, each worker gets
    its own disjoint slice: with 10 users per worker, worker 0 receives
    range(0, 10) and worker 2 receives range(20, 30).
    The result is computed once and cached.
    """
    users_per_worker = int(os.getenv('COOK_MAX_TEST_USERS', 0))
    worker_name = os.getenv('PYTEST_XDIST_WORKER')
    if not (worker_name and users_per_worker):
        return range(1000000)
    worker_index = int(worker_name[2:])  # e.g., "gw4" -> 4
    first_id = users_per_worker * worker_index
    return range(first_id, first_id + users_per_worker)
def _test_user_names(test_name_prefix=None):
    """
    Build a generator of unique test user names of the form {PREFIX}{ID}.
    PREFIX is the COOK_TEST_USER_PREFIX environment variable when it is set,
    falling back to test_name_prefix otherwise; the IDs come from _test_user_ids().
    """
    prefix = os.getenv('COOK_TEST_USER_PREFIX', test_name_prefix)
    return (f'{prefix}{user_id}' for user_id in _test_user_ids())
# Shell command used to obtain Kerberos credentials for a given test user.
# It is read from COOK_KERBEROS_TEST_AUTH_CMD; when that variable is unset, the
# fallback command below just prints an error message and exits with status 1.
_kerberos_missing_cmd = 'echo "MISSING COOK_KERBEROS_TEST_AUTH_CMD" && exit 1'
_kerberos_auth_cmd = os.getenv('COOK_KERBEROS_TEST_AUTH_CMD', _kerberos_missing_cmd)
class _AuthenticatedUser(object):
    """
    A Cook user together with its authentication details.
    Instances are context managers: inside a `with` block the shared session
    carries this user's impersonation header (when an impersonatee is set),
    and the previous header state is put back when the block exits.
    """

    def __init__(self, name, impersonatee=None):
        self.name = name
        self.impersonatee = impersonatee
        # header value to restore on exit; only populated while inside a `with` block
        self.previous_impersonatee = None

    def impersonating(self, other_user):
        """Return a new user of the same class that impersonates other_user (a user object or a plain name)."""
        if isinstance(other_user, _AuthenticatedUser):
            target_name = other_user.name
        else:
            target_name = other_user
        return type(self)(self.name, impersonatee=target_name)

    def __enter__(self):
        logger.debug(f'Switching to user {self.name}')
        if not self.impersonatee:
            return
        # remember whatever impersonation header was active before overwriting it
        self.previous_impersonatee = session.headers.get(IMPERSONATION_HEADER)
        session.headers[IMPERSONATION_HEADER] = self.impersonatee

    def __exit__(self, ex_type, ex_val, ex_trace):
        logger.debug(f'Switching back from user {self.name}')
        if not self.impersonatee:
            return
        if not self.previous_impersonatee:
            # no header was set before we entered, so remove ours entirely
            del session.headers[IMPERSONATION_HEADER]
            return
        session.headers[IMPERSONATION_HEADER] = self.previous_impersonatee
        self.previous_impersonatee = None
class _BasicAuthUser(_AuthenticatedUser):
    """
    A Cook user authenticated through HTTP Basic Auth.
    The credentials are the user name paired with an empty password.
    """

    def __init__(self, name, impersonatee=None):
        super().__init__(name, impersonatee)
        self.auth = (name, '')
        # session auth to restore on exit; only populated while inside a `with` block
        self.previous_auth = None

    def __enter__(self):
        super().__enter__()
        # entering the same user object twice without exiting is not supported
        assert self.previous_auth is None
        self.previous_auth, session.auth = session.auth, self.auth

    def __exit__(self, ex_type, ex_val, ex_trace):
        super().__exit__(ex_type, ex_val, ex_trace)
        assert self.previous_auth is not None
        session.auth, self.previous_auth = self.previous_auth, None
class _KerberosUser(_AuthenticatedUser):
    """
    Object representing a Cook user with Kerberos credentials.
    A ticket is generated once when the object is constructed and is installed
    as the session's Authorization header while inside a `with` block.
    """

    def __init__(self, name, impersonatee=None):
        super().__init__(name, impersonatee)
        self.auth = None
        self.auth_token = self._generate_kerberos_ticket_for_user(name)
        # Authorization header to restore on exit; only set while inside a `with` block
        self.previous_token = None

    # NOTE: this method used to be wrapped in functools.lru_cache. On an instance
    # method the cache key includes `self`, so it could not share tickets between
    # user objects and it kept every user object alive for the whole process.
    # The ticket is already stored on the instance (self.auth_token) by __init__.
    def _generate_kerberos_ticket_for_user(self, username):
        """
        Get a Kerberos authentication ticket for the given user.
        Depends on COOK_KERBEROS_TEST_AUTH_CMD being set in the environment.
        The {{COOK_USER}} and {{COOK_SCHEDULER_URL}} placeholders in that command
        are replaced before it is run; the command's output (with trailing
        whitespace removed) is returned.
        Raises subprocess.CalledProcessError if the command exits with a non-zero status.
        """
        subcommand = (_kerberos_auth_cmd
                      .replace('{{COOK_USER}}', username)
                      .replace('{{COOK_SCHEDULER_URL}}', retrieve_cook_url()))
        # NOTE(review): the command is run through the shell with the user name
        # interpolated as-is; only use with trusted (test-generated) user names.
        return subprocess.check_output(subcommand, shell=True).rstrip()

    def __enter__(self):
        super().__enter__()
        # entering the same user object twice without exiting is not supported
        assert self.previous_token is None
        self.previous_token = session.headers.get('Authorization')
        session.headers['Authorization'] = self.auth_token

    def __exit__(self, ex_type, ex_val, ex_trace):
        super().__exit__(ex_type, ex_val, ex_trace)
        if self.previous_token is None:
            # there was no Authorization header before we entered
            del session.headers['Authorization']
        else:
            session.headers['Authorization'] = self.previous_token
            self.previous_token = None
class UserFactory(object):
    """
    Factory producing unique users for a given test function.
    Usernames are composed of the test name and increasing integer values.
    """

    def __init__(self, test_handle):
        # Pick the user class matching Cook's authentication scheme
        basic_auth = http_basic_auth_enabled()
        if not basic_auth and not kerberos_enabled():
            raise NotImplementedError(f'Unsupported user authentication scheme: {_cook_auth_scheme()}')
        self.user_class = _BasicAuthUser if basic_auth else _KerberosUser
        # Without a test handle no generator is created, so only the fixed users
        # (default/admin/impersonator) can be requested from this factory
        if test_handle:
            marker = '.test_'
            test_id = test_handle.id()
            # keep only what follows the last ".test_" in the test id, lower-cased
            name_start = test_id.rindex(marker) + len(marker)
            self.__user_generator = _test_user_names(test_id[name_start:].lower())

    def new_user(self):
        """Create a user object for the next unused generated name."""
        user_name = next(self.__user_generator)
        return self.user_class(user_name)

    def new_users(self, count=None):
        """Create a list of `count` user objects with fresh names (all remaining names if count is None)."""
        fresh_names = itertools.islice(self.__user_generator, count)
        return list(map(self.user_class, fresh_names))

    @functools.lru_cache()
    def default(self):
        """Get the default user (cached per factory)."""
        user_name = _get_default_user_name()
        return self.user_class(user_name)

    @functools.lru_cache()
    def admin(self):
        """Get the administrator user (cached per factory); the name can be set via COOK_ADMIN_USER_NAME."""
        return self.user_class(os.getenv('COOK_ADMIN_USER_NAME', _default_admin_name))

    @functools.lru_cache()
    def impersonator(self):
        """Get the impersonator user (cached per factory); the name can be set via COOK_IMPERSONATOR_USER_NAME."""
        return self.user_class(os.getenv('COOK_IMPERSONATOR_USER_NAME', _default_impersonator_name))
def multi_cluster_tests_enabled():
    """
    Tell whether multiple cook scheduler instances are running,
    which is signalled by the COOK_MULTI_CLUSTER environment variable
    being set (to any value, including the empty string).
    """
    return 'COOK_MULTI_CLUSTER' in os.environ
@functools.lru_cache()
def _cook_auth_scheme():
    """
    Look up the authentication scheme name reported by the cook scheduler's info endpoint.
    Waits for Cook to accept connections first; the result is cached after the first call.
    """
    url = retrieve_cook_url()
    _wait_for_cook(url)
    info = scheduler_info(url)
    logger.info(f"Cook's authentication scheme is {info['authentication-scheme']}")
    return info['authentication-scheme']
def http_basic_auth_enabled():
    """Tell whether Cook reports http-basic as its authentication scheme."""
    scheme = _cook_auth_scheme()
    return 'http-basic' == scheme
def kerberos_enabled():
    """Tell whether Cook reports kerberos as its authentication scheme."""
    scheme = _cook_auth_scheme()
    return 'kerberos' == scheme
def multi_user_tests_enabled():
    """Tell whether Cook uses an authentication scheme that supports multiple users (http-basic or kerberos)."""
    if http_basic_auth_enabled():
        return True
    return kerberos_enabled()
def get_in(dct, *keys):
    """
    Look up a value in a nested structure by following `keys` in order.
    Returns the value found at the end of the key path, or None as soon as
    the path cannot be followed: an intermediate value is empty/None, a key
    or index is missing, or an intermediate value cannot be subscripted.
    With no keys, `dct` itself is returned.
    """
    for key in keys:
        if not dct:
            return None
        try:
            dct = dct[key]
        except (KeyError, IndexError, TypeError):
            # missing dict key, out-of-range list index, or a non-subscriptable
            # intermediate value (e.g., a number where a dict was expected)
            return None
    return dct
def is_valid_uuid(uuid_to_test, version=4):
    """
    Check if uuid_to_test is a valid UUID.
    The string must be in canonical form (lower-case, hyphenated) and must
    carry the given version, since the parsed UUID is converted back to a
    string and compared with the input.

    Parameters
    ----------
    uuid_to_test : str
    version : {1, 2, 3, 4}

    Returns
    -------
    `True` if uuid_to_test is a valid UUID, otherwise `False`.

    Examples
    --------
    >>> is_valid_uuid('c9bf9e57-1685-4c89-bafb-ff5af830be8a')
    True
    >>> is_valid_uuid('c9bf9e58')
    False
    """
    try:
        uuid_obj = uuid.UUID(uuid_to_test, version=version)
    except (ValueError, TypeError, AttributeError):
        # ValueError: malformed string or illegal version number;
        # TypeError / AttributeError: input is not a string (e.g., None or an int)
        return False
    return str(uuid_obj) == uuid_to_test
@functools.lru_cache()
def retrieve_cook_url(varname='COOK_SCHEDULER_URL', value='http://localhost:12321'):
    """
    Determine the Cook scheduler URL: the value of the `varname` environment
    variable when it is set, `value` otherwise. The result is logged and
    cached per (varname, value) pair.
    """
    url = os.environ.get(varname, value)
    logger.info(f'Using cook url {url}')
    return url
@functools.lru_cache()
def retrieve_mesos_url(varname='MESOS_PORT', value='5050'):
    """
    Determine the URL of the Mesos leader.
    COOK_MESOS_LEADER_URL is used when set. Otherwise the first host from Cook's
    'mesos-master-hosts' setting (default: localhost) is asked for a redirect on
    the port given by the `varname` environment variable (default: `value`), and
    the redirect's Location header is used to build the URL.
    Raises RuntimeError when the redirect endpoint does not answer with a 307.
    The result is cached.
    """
    leader_url = os.getenv('COOK_MESOS_LEADER_URL')
    if leader_url is None:
        port = os.getenv(varname, value)
        cook_url = retrieve_cook_url()
        _wait_for_cook(cook_url)
        master_hosts = settings(cook_url).get('mesos-master-hosts', ['localhost'])
        # do not follow the redirect: only its target is of interest
        resp = session.get(f'http://{master_hosts[0]}:{port}/redirect', allow_redirects=False)
        if resp.status_code != 307:
            raise RuntimeError('Unable to find mesos leader, redirect endpoint returned %d' % resp.status_code)
        # the Location header is prefixed with the scheme as-is
        leader_url = f"http:{resp.headers['Location']}"
    logger.info(f'Using mesos url {leader_url}')
    return leader_url
def is_not_blank(in_string):
    """Return True only for strings that contain at least one non-whitespace character (False for None and '')."""
    if not in_string:
        return False
    return bool(in_string.strip())
def get_job_executor_type(cook_url):
    """
    Determine the default executor Cook is configured with:
    'cook' when the settings contain a non-blank executor command, 'mesos' otherwise.
    """
    executor_command = get_in(settings(cook_url), 'executor', 'command')
    if is_not_blank(executor_command):
        return 'cook'
    return 'mesos'
def is_connection_error(exception):
    """Tell whether the given exception is a requests ConnectionError (used as a retry predicate)."""
    connection_error_type = requests.exceptions.ConnectionError
    return isinstance(exception, connection_error_type)
@retry(retry_on_exception=is_connection_error, stop_max_delay=240000, wait_fixed=1000)
def _wait_for_cook(cook_url):
    """
    Block until a GET request to cook_url goes through.
    The retry decorator repeats the request once per second, for up to
    4 minutes, for as long as it fails with a connection error.
    """
    logger.debug('Waiting for connection to cook...')
    # a refused connection surfaces as an exception, which triggers the retry
    session.get(cook_url)
def init_cook_session(*cook_urls):
    """
    Wait for every given Cook instance to accept connections and,
    when http-basic authentication is in use, make the default user's
    credentials the shared session's credentials.
    """
    for url in cook_urls:
        _wait_for_cook(url)
    if not http_basic_auth_enabled():
        return
    default_user = UserFactory(None).default()
    session.auth = default_user.auth
def settings(cook_url):
    """Fetch Cook's /settings endpoint and return the parsed JSON body."""
    response = session.get(f'{cook_url}/settings')
    return response.json()
@functools.lru_cache()
def scheduler_info(cook_url):
    """
    Fetch Cook's /info endpoint (explicitly without credentials) and return the parsed JSON body.
    Asserts on a 200 response; results are cached per URL.
    """
    info_response = session.get(f'{cook_url}/info', auth=None)
    assert info_response.status_code == 200
    return info_response.json()
def minimal_job(**kwargs):
    """
    Build a job spec with sensible defaults and a fresh random UUID.
    Default cpus and mem can be set through the COOK_DEFAULT_JOB_CPUS and
    COOK_DEFAULT_JOB_MEM_MB environment variables; any keyword argument
    overrides (or extends) the defaults.
    """
    default_cpus = float(os.getenv('COOK_DEFAULT_JOB_CPUS', 1.0))
    default_mem = int(os.getenv('COOK_DEFAULT_JOB_MEM_MB', 256))
    return {
        'command': 'echo Default Test Command',
        'cpus': default_cpus,
        'max_retries': 1,
        'mem': default_mem,
        'name': 'default_test_job',
        'priority': 1,
        'uuid': str(uuid.uuid4()),
        **kwargs,
    }
def minimal_jobs(job_count, **kwargs):
    """Build a list of `job_count` homogeneous job specifications, each created by minimal_job(**kwargs)."""
    specs = []
    for _ in range(job_count):
        specs.append(minimal_job(**kwargs))
    return specs
def minimal_group(**kwargs):
    """
    Build a minimal group spec: a dict with a fresh random UUID,
    overridden or extended by the given keyword arguments.
    As with minimal_job, an explicit `uuid` keyword replaces the generated one.
    """
    group = {'uuid': str(uuid.uuid4())}
    # update (rather than dict(uuid=..., **kwargs)) so that a caller-supplied
    # uuid overrides the default instead of raising a duplicate-keyword TypeError
    group.update(kwargs)
    return group
def submit_jobs(cook_url, job_specs, clones=1, pool=None, **kwargs):
    """
    Submit several jobs in a single POST to Cook's /jobs endpoint.
    `job_specs` is either a list of individual job specs, or a single spec
    (a dict) that is submitted `clones` times. Specs without a 'uuid' are
    completed via minimal_job. A truthy `pool` is added to the request body,
    as is every extra keyword argument.
    Returns a pair: the list of submitted job UUIDs and the response object.
    """
    if isinstance(job_specs, dict):
        job_specs = [job_specs] * clones
    # specs that already carry a uuid are sent as-is; the others get defaults filled in
    submitted = [spec if 'uuid' in spec else minimal_job(**spec) for spec in job_specs]
    payload = {'jobs': submitted}
    if pool:
        payload['pool'] = pool
    payload.update(kwargs)
    logger.info(payload)
    resp = session.post(f'{cook_url}/jobs', json=payload)
    job_uuids = [job['uuid'] for job in submitted]
    return job_uuids, resp
def retry_jobs(cook_url, assert_response=True, use_deprecated_post=False, **kwargs):
    """
    Retry one or more jobs and/or groups of jobs through Cook's /retry endpoint.
    The keyword arguments form the JSON request body. PUT is used unless
    use_deprecated_post is set. With assert_response, the status code must be
    200 or 201 and must agree with the retried-job count in the response body.
    Returns the response object.
    """
    retry_url = f'{cook_url}/retry'
    if use_deprecated_post:
        response = session.post(retry_url, json=kwargs)
    else:
        response = session.put(retry_url, json=kwargs)
    if not assert_response:
        return response
    response_info = {'code': response.status_code, 'msg': response.content}
    assert response.status_code in (200, 201), response_info
    retried_job_count = int(response.text)
    if response.status_code == 200:
        # response code 200 OK implies zero retried jobs
        assert retried_job_count == 0, response_info
    if response.status_code == 201:
        # response code 201 Created implies non-zero retried jobs
        assert retried_job_count > 0, response_info
    return response
def kill_jobs(cook_url, jobs, assert_response=True, expected_status_code=204):
    """
    Kill one or more jobs (given as UUID strings or job specs) via DELETE on /rawscheduler.
    With assert_response, the response status must equal expected_status_code.
    Returns the response object.
    """
    job_uuids = [unpack_uuid(job) for job in jobs]
    response = session.delete(f'{cook_url}/rawscheduler', params={'job': job_uuids})
    if assert_response:
        assert expected_status_code == response.status_code, response.content
    return response
def kill_groups(cook_url, groups, assert_response=True, expected_status_code=204):
    """
    Kill one or more groups of jobs (given as UUID strings or group specs) via DELETE on /group.
    With assert_response, the response status must equal expected_status_code.
    Returns the response object.
    """
    group_uuids = [unpack_uuid(group) for group in groups]
    response = session.delete(f'{cook_url}/group', params={'uuid': group_uuids})
    if assert_response:
        assert expected_status_code == response.status_code, response.content
    return response
def submit_job(cook_url, pool=None, **kwargs):
    """
    Create and submit a single job whose spec is given by the keyword arguments.
    Returns a pair: the job's UUID and the response object.
    """
    job_uuids, resp = submit_jobs(cook_url, job_specs=[kwargs], pool=pool)
    first_uuid = job_uuids[0]
    return first_uuid, resp
def unpack_uuid(entity):
    """Extract the 'uuid' entry when given a dict (e.g., a job spec); anything else is returned unchanged."""
    if isinstance(entity, dict):
        return entity['uuid']
    return entity
def __get(cook_url, endpoint, assert_response=False, **kwargs):
    """
    Issue a GET request to `endpoint` under the given root URL, sending the
    keyword arguments as query parameters. A 'partial' argument is normalized
    to the string 'true' or 'false' first. With assert_response, the response
    status must be 200. Returns the response object.
    """
    if 'partial' in kwargs:
        wants_partial = kwargs['partial'] in [True, 'true', '1']
        kwargs['partial'] = 'true' if wants_partial else 'false'
    url = f'{cook_url}/{endpoint}'
    response = session.get(url, params=kwargs)
    if assert_response:
        assert 200 == response.status_code
    return response
def query_jobs_via_rawscheduler_endpoint(cook_url, assert_response=False, **kwargs):
    """
    Queries cook for a set of jobs, by job and/or instance uuid. The kwargs
    passed to this function are sent straight through as query parameters on
    the request.
    If the job or instance values are dictionaries (e.g., job_specs),
    then they are automatically unpacked to get their UUIDs.
    Returns the response object; with assert_response, a 200 status is asserted.
    """
    for key in ('job', 'instance'):
        if key in kwargs:
            # build a real list (not a one-shot lazy map object) so the value can
            # be inspected or iterated more than once, as query_resource does
            kwargs[key] = [unpack_uuid(entity) for entity in kwargs[key]]
    return __get(cook_url, 'rawscheduler', assert_response, **kwargs)
def query_resource(cook_url, resource, assert_response=False, **kwargs):
    """
    Query cook's `resource` endpoint for a set of entities by uuid.
    The keyword arguments become the request's query parameters; a `uuid`
    argument is required, and any dictionaries among its values
    (e.g., job_specs) are replaced by their UUIDs.
    Returns the response object.
    """
    kwargs['uuid'] = list(map(unpack_uuid, kwargs['uuid']))
    return __get(cook_url, resource, assert_response, **kwargs)
def query_jobs(cook_url, assert_response=False, **kwargs):
    """Query cook's /jobs endpoint for a set of jobs by job uuid; returns the response object."""
    resource = 'jobs'
    return query_resource(cook_url, resource, assert_response, **kwargs)
def query_instances(cook_url, assert_response=False, **kwargs):
    """Query cook's /instances endpoint for a set of job instances by instance uuid; returns the response object."""
    resource = 'instances'
    return query_resource(cook_url, resource, assert_response, **kwargs)
def query_groups(cook_url, **kwargs):
    """
    Query cook's /group endpoint for a set of groups by group uuid.
    The keyword arguments are sent unchanged as the request's query
    parameters. Returns the response object.
    """
    group_url = f'{cook_url}/group'
    return session.get(group_url, params=kwargs)
def load_resource(cook_url, resource, uuid, assert_response=True):
    """
    Load a single entity by UUID using GET /resource/UUID.
    With assert_response, the response status must be 200.
    Returns the parsed JSON body.
    """
    entity_url = f'{cook_url}/{resource}/{uuid}'
    response = session.get(entity_url)
    if assert_response:
        assert 200 == response.status_code
    return response.json()
def load_job(cook_url, job_uuid, assert_response=True):
    """Load a job by UUID using GET /jobs/UUID and return its parsed JSON description."""
    resource = 'jobs'
    return load_resource(cook_url, resource, job_uuid, assert_response)
def load_instance(cook_url, instance_uuid, assert_response=True):
    """Load a job instance by UUID using GET /instances/UUID and return its parsed JSON description."""
    resource = 'instances'
    return load_resource(cook_url, resource, instance_uuid, assert_response)
def wait_until(query, predicate, max_wait_ms=DEFAULT_TIMEOUT_MS, wait_interval_ms=1000):
    """
    Block until the predicate is true for the result of the provided query.
    `query` is a thunk (nullary callable) that may be called multiple times.
    `predicate` is a unary callable that takes the result value of `query`
    and returns True if the condition is met, or False otherwise.
    The query is re-run every `wait_interval_ms` milliseconds, for at most
    `max_wait_ms` milliseconds. Returns the first query result that satisfies
    the predicate; if the wait fails, the triggering exception is re-raised
    after logging the result of one final query.
    See `wait_for_job` for an example of using this method.
    """

    @retry(stop_max_delay=max_wait_ms, wait_fixed=wait_interval_ms)
    def wait_until_inner():
        response = query()
        if not predicate(response):
            # raising is what makes the retry decorator try again
            error_msg = "wait_until condition not yet met, retrying..."
            logger.debug(error_msg)
            raise RuntimeError(error_msg)
        else:
            logger.info("wait_until condition satisfied")
            return response

    try:
        return wait_until_inner()
    except BaseException:
        # Best-effort diagnostics: run the query one last time to log its result.
        # A failure of this extra query must not replace the original error.
        try:
            final_response = query()
            try:
                details = final_response.content
            except AttributeError:
                # not a response object (e.g., already-parsed JSON)
                details = str(final_response)
        except Exception as query_error:
            details = f'<final query failed: {query_error!r}>'
        logger.info(f"Timeout exceeded waiting for condition. Details: {details}")
        raise
def all_instances_done(response, accepted_states=('success', 'failed')):
    """
    Predicate for use with wait_until on a query_jobs response.
    True when every job's state, and the status of each of its instances,
    is one of accepted_states; False as soon as one is not.
    """
    for job in response.json():
        if job['state'] not in accepted_states:
            return False
        # first instance (if any) that is still in a non-accepted status
        pending = next((inst for inst in job['instances'] if inst['status'] not in accepted_states), None)
        if pending is not None:
            logger.info(f"Job {job['uuid']} instance {pending['task_id']} has unaccepted status {pending['status']}.")
            return False
    return True
def all_instances_killed(response):
    """
    Predicate for use with wait_until on a query_jobs response.
    True when all jobs and all of their instances are in the 'failed' state.
    """
    killed_states = ['failed']
    return all_instances_done(response, accepted_states=killed_states)
def group_some_job_started(group_response):
    """
    Predicate for use with wait_until on a group_detail_query response.
    True when the (first) group in the response reports at least one running job.
    """
    first_group = group_response.json()[0]
    num_running = first_group['running']
    logger.info(f"Currently {num_running} jobs running in group {first_group['uuid']}")
    return num_running > 0
def group_some_job_done(group_response):
    """
    Predicate for use with wait_until on a group_detail_query response.
    True when the (first) group in the response reports at least one completed job.
    """
    first_group = group_response.json()[0]
    num_completed = first_group['completed']
    logger.info(f"Currently {num_completed} jobs completed in group {first_group['uuid']}")
    return num_completed > 0
def group_detail_query(cook_url, group_uuid, assert_response=True):
    """
    Query a single group with full status details (detailed=true).
    With assert_response, the response status must be 200.
    Returns the response object.
    """
    detailed_response = query_groups(cook_url, uuid=[group_uuid], detailed='true')
    if assert_response:
        assert 200 == detailed_response.status_code, detailed_response.content
    return detailed_response
def wait_for_job(cook_url, job_id, status, max_wait_ms=DEFAULT_TIMEOUT_MS):
    """Wait for the given job's status to become `status`, then return the job's description."""
    matching_jobs = wait_for_jobs(cook_url, [job_id], status, max_wait_ms)
    return matching_jobs[0]
def wait_for_jobs(cook_url, job_ids, status, max_wait_ms=DEFAULT_TIMEOUT_MS):
    """
    Wait (polling every 2 seconds) until every one of the given jobs has the
    given status, then return the list of job descriptions.
    """

    def query():
        return query_jobs(cook_url, True, uuid=job_ids)

    def predicate(resp):
        all_match = True
        # every job's status is logged, even after a mismatch has been found
        for job in resp.json():
            logger.info(f"Job {job['uuid']} has status {job['status']}, expecting {status}.")
            if not job['status'] == status:
                all_match = False
        return all_match

    final_response = wait_until(query, predicate, max_wait_ms=max_wait_ms, wait_interval_ms=2000)
    return final_response.json()
def wait_for_exit_code(cook_url, job_id, max_wait_ms=DEFAULT_TIMEOUT_MS):
    """
    Wait until the first instance of the given job has an exit_code field.
    (Only supported by Cook Executor jobs.)
    On success, the up-to-date job description is returned;
    an exception is raised if max_wait_ms is exceeded first.
    """
    job_id = unpack_uuid(job_id)

    def query():
        return query_jobs(cook_url, True, uuid=[job_id])

    def predicate(resp):
        instances = resp.json()[0]['instances']
        if not instances:
            logger.info(f"Job {job_id} has no instances.")
            return False
        first = instances[0]
        if 'exit_code' not in first:
            logger.info(f"Job {job_id} instance {first['task_id']} has no exit code.")
            return False
        logger.info(f"Job {job_id} instance {first['task_id']} has exit code {first['exit_code']}.")
        return True

    final_response = wait_until(query, predicate, max_wait_ms=max_wait_ms)
    return final_response.json()[0]
def wait_for_sandbox_directory(cook_url, job_id):
    """
    Wait until the first instance of the given job has a sandbox_directory field.
    The maximum wait is derived from Cook's settings: four times the larger of
    the agent-query-cache TTL and the sandbox-syncer sync interval, capped at
    4 minutes. The job is polled every 250 ms.
    On success, the up-to-date job description is returned;
    an exception is raised if the maximum wait is exceeded first.
    """
    job_id = unpack_uuid(job_id)
    cook_settings = settings(cook_url)
    slowest_refresh_ms = max(cook_settings['agent-query-cache']['ttl-ms'],
                             cook_settings['sandbox-syncer']['sync-interval-ms'])
    four_minutes_ms = 4 * 60 * 1000
    max_wait_ms = min(4 * slowest_refresh_ms, four_minutes_ms)

    def query():
        return query_jobs(cook_url, True, uuid=[job_id]).json()[0]

    def predicate(job):
        instances = job['instances']
        if not instances:
            logger.info(f"Job {job_id} has no instances.")
            return False
        first = instances[0]
        if 'sandbox_directory' not in first:
            logger.info(f"Job {job_id} instance {first['task_id']} has no sandbox directory.")
            return False
        logger.info(
            f"Job {job_id} instance {first['task_id']} has sandbox directory {first['sandbox_directory']}.")
        return True

    return wait_until(query, predicate, max_wait_ms=max_wait_ms, wait_interval_ms=250)
def wait_for_end_time(cook_url, job_id, max_wait_ms=DEFAULT_TIMEOUT_MS):
    """
    Wait until instance 0 of the given job has an end_time field.
    On success, the up-to-date job description is returned;
    an exception is raised if max_wait_ms is exceeded first.
    """
    job_id = unpack_uuid(job_id)

    def query():
        return query_jobs(cook_url, True, uuid=[job_id])

    def predicate(resp):
        instances = resp.json()[0]['instances']
        if not instances:
            logger.info(f"Job {job_id} has no instances.")
            return False
        first = instances[0]
        if 'end_time' not in first:
            logger.info(f"Job {job_id} instance {first['task_id']} has no end time.")
            return False
        logger.info(f"Job {job_id} instance {first['task_id']} has end_time {first['end_time']}.")
        return True

    final_response = wait_until(query, predicate, max_wait_ms=max_wait_ms)
    return final_response.json()[0]
def wait_for_running_instance(cook_url, job_id, max_wait_ms=DEFAULT_TIMEOUT_MS):
    """
    Waits for the job with the given job_id to have a running instance.
    `job_id` may be a UUID string or a job spec (dict with a 'uuid' entry).
    Returns the first instance of the job once the wait succeeds, and
    raises an exception if the max_wait_ms wait time is exceeded.
    """
    job_id = unpack_uuid(job_id)

    def query():
        return query_jobs(cook_url, True, uuid=[job_id])

    def predicate(resp):
        job = resp.json()[0]
        if not job['instances']:
            logger.info(f"Job {job_id} has no instances.")
        else:
            # NOTE(review): the return sits inside the loop body, so only the
            # first instance's status is actually examined — confirm intended
            for inst in job['instances']:
                status = inst['status']
                logger.info(f"Job {job_id} instance {inst['task_id']} has status {status}, expected running.")
                return status == 'running'

    response = wait_until(query, predicate, max_wait_ms=max_wait_ms)
    return response.json()[0]['instances'][0]
def get_mesos_state(mesos_url):
    """
    Fetch state.json from mesos and return the parsed JSON body.
    """
    state_response = session.get(f'{mesos_url}/state.json')
    return state_response.json()
def wait_for_output_url(cook_url, job_uuid):
    """
    Wait until the first instance of the given job has an output_url,
    polling with wait_until's default interval and timeout.
    Polling is needed because the Mesos agent sandbox directories
    are currently cached in Cook.
    Returns the job's first instance.
    """

    def query():
        return load_job(cook_url, job_uuid, assert_response=False)

    def predicate(job):
        first_instance = job['instances'][0]
        if 'output_url' not in first_instance:
            logger.info(f"Job {job['uuid']} had no output_url")
            return None
        return True

    loaded_job = wait_until(query, predicate)
    return loaded_job['instances'][0]
def list_jobs(cook_url, **kwargs):
    """
    Query the /list endpoint, using the keyword arguments as query parameters.
    The start_ms and end_ms arguments are sent as start-ms and end-ms.
    Returns the response object.
    """
    for python_name, query_name in (('start_ms', 'start-ms'), ('end_ms', 'end-ms')):
        if python_name in kwargs:
            kwargs[query_name] = kwargs.pop(python_name)
    query_string = urlencode(kwargs)
    return session.get(f'{cook_url}/list?{query_string}')
def jobs(cook_url, **kwargs):
    """
    Query the /jobs endpoint, using the keyword arguments as query parameters
    (sequence values are expanded into repeated parameters).
    Returns the response object.
    """
    query_string = urlencode(kwargs, doseq=True)
    return session.get(f'{cook_url}/jobs?{query_string}')
def contains_job_uuid(jobs, job_uuid):
    """Tell whether any job in `jobs` has the given uuid."""
    for job in jobs:
        if job['uuid'] == job_uuid and job:
            return True
    return False
def get_executor(agent_state, executor_id):
    """
    Find the executor with id executor_id among all frameworks in agent_state.
    Returns the first match, or None when there is none.
    """
    matching_executors = (
        executor
        for framework in agent_state['frameworks']
        for executor in framework['executors']
        if executor['id'] == executor_id
    )
    return next(matching_executors, None)
def get_user(cook_url, job_uuid):
    """Load the job with the given job_uuid and return its 'user' field."""
    job = load_job(cook_url, job_uuid)
    return job['user']
def unscheduled_jobs(cook_url, *job_uuids, partial=None):
    """
    Query the /unscheduled_jobs endpoint for the given job uuids.
    `partial` is added as a query parameter when it is not None.
    Returns a pair: the parsed JSON body (an empty list unless the
    response status is 200) and the response object.
    """
    params = [('job', job_uuid) for job_uuid in job_uuids]
    if partial is not None:
        params.append(('partial', partial))
    query_string = urlencode(params)
    resp = session.get(f'{cook_url}/unscheduled_jobs?{query_string}')
    if resp.status_code == 200:
        job_reasons = resp.json()
    else:
        job_reasons = []
    return job_reasons, resp
def wait_for_instance(cook_url, job_uuid):
    """
    Wait for the job with the given job_uuid to have exactly one instance.
    Returns that instance, with the job's description attached under 'parent'.
    """

    def query():
        return load_job(cook_url, job_uuid)

    def has_single_instance(loaded_job):
        return len(loaded_job['instances']) == 1

    job = wait_until(query, has_single_instance)
    instance = job['instances'][0]
    instance['parent'] = job
    return instance
def sleep_for_publish_interval(cook_url):
    """
    Sleep for three times Cook's progress publish interval (capped at 20 seconds).
    """
    # allow enough time for progress and sandbox updates to be submitted
    publish_interval_ms = get_in(settings(cook_url), 'progress', 'publish-interval-ms')
    sleep_ms = min(3 * publish_interval_ms, 20000)
    time.sleep(sleep_ms / 1000.0)
def progress_line(cook_url, percent, message):
    r"""
    Build a progress line that matches Cook's configured progress regex.
    This is a simple text replacement on the regex string itself: the
    sub-pattern ([0-9]*\.?[0-9]+) is replaced by `percent`, the sub-pattern
    ($|\s+.*) by a space followed by `message`, every remaining \s+ by a
    single space, and all remaining backslashes are dropped.
    The regex string comes from the executor's default-progress-regex-string
    setting, with a built-in default when that setting is missing.
    Raises an Exception if either expected sub-pattern is absent.
    """
    # Raw strings keep the regex text free of invalid escape sequences
    # (\s, \.) while producing exactly the same characters at runtime.
    percent_pattern = r'([0-9]*\.?[0-9]+)'
    message_pattern = r'($|\s+.*)'
    cook_settings = settings(cook_url)
    regex_string = get_in(cook_settings, 'executor', 'default-progress-regex-string')
    if not regex_string:
        regex_string = r'progress:\s+([0-9]*\.?[0-9]+)($|\s+.*)'
    if percent_pattern not in regex_string:
        raise Exception(f'{percent_pattern} not present in {regex_string} regex string')
    if message_pattern not in regex_string:
        raise Exception(f'{message_pattern} not present in {regex_string} regex string')
    return (regex_string
            .replace(percent_pattern, str(percent))
            .replace(message_pattern, f' {message}')
            .replace(r'\s+', ' ')
            .replace('\\', ''))
def group_submit_kill_retry(cook_url, retry_failed_jobs_only):
    """
    Helper method for integration tests on groups, following these steps:
    1) Creates a group of 10 jobs
    2) Waits for at least one job to start
    3) Kills all the jobs
    4) Retries the jobs
    5) Waits for at least one job to start (again)
    6) Finally kills all the jobs again (clean up)
    `retry_failed_jobs_only` is passed to the retry request as `failed_only`.
    Returns the job info (json response) for the group's jobs after step 5.
    """
    group_spec = minimal_group()
    group_uuid = group_spec['uuid']
    job_spec = {'group': group_uuid, 'command': f'sleep 1'}
    try:
        # the single job spec is cloned 10 times; the group spec travels in the same request
        jobs, resp = submit_jobs(cook_url, job_spec, 10, groups=[group_spec])
        assert resp.status_code == 201, resp

        def group_query():
            return group_detail_query(cook_url, group_uuid)

        # wait for some job to start
        wait_until(group_query, group_some_job_started)
        # kill all jobs in the group (and wait for the kill to complete)
        logger.info(f'Killing all jobs in group {group_uuid}.')
        kill_groups(cook_url, [group_uuid])

        # NOTE: `jobs` is looked up when jobs_query is called; at that point it
        # still holds the list of job UUIDs returned by submit_jobs
        def jobs_query():
            return query_jobs(cook_url, True, uuid=jobs)

        wait_until(jobs_query, all_instances_done)
        # from here on `jobs` holds job descriptions (dicts); query_jobs unpacks their UUIDs
        jobs = query_jobs(cook_url, assert_response=True, uuid=jobs).json()
        for job in jobs:
            logger.info(f'Job details: {json.dumps(job, sort_keys=True)}')
        # retry all jobs in the group
        retry_jobs(cook_url, retries=2, groups=[group_uuid], failed_only=retry_failed_jobs_only)
        # wait for some job to start
        wait_until(group_query, group_some_job_started)
        # return final job details to caller for assertion checks
        jobs = query_jobs(cook_url, assert_response=True, uuid=jobs).json()
        for job in jobs:
            logger.info(f'Dumping sandbox files for job {job}')
            for instance in job['instances']:
                logger.info(f'Dumping sandbox files for instance {instance}')
                mesos.dump_sandbox_files(session, instance, job)
        return jobs
    finally:
        # ensure that we don't leave a bunch of jobs running/waiting
        kill_groups(cook_url, [group_uuid])
def group_submit_retry(cook_url, command, predicate_statuses, retry_failed_jobs_only=True):
    """
    Integration-test helper that submits a group of jobs and retries them once:
    1) Submits a group of 5 jobs running the given command (max_retries of 1)
    2) Waits until the group's job counts match predicate_statuses
    3) Retries the jobs (increment of 1, honoring retry_failed_jobs_only)
    4) Waits again until the group's job counts match predicate_statuses
    5) Kills the whole group as clean-up, even when an earlier step raised
    Returns the job info (json response) for the group's jobs after step 4.
    """
    job_count = 5
    group_spec = minimal_group()
    group_uuid = group_spec['uuid']
    job_spec = {'group': group_uuid, 'max_retries': 1, 'command': command}

    def group_query():
        return group_detail_query(cook_url, group_uuid)

    def status_condition(response):
        group = response.json()[0]
        statuses_map = {status: group[status] for status in predicate_statuses}
        # for running & waiting, we want at least one running (not all waiting)
        some_job_left_waiting = group['waiting'] != job_count
        logger.debug(f"Currently {statuses_map} jobs in group {group['uuid']}")
        if some_job_left_waiting and sum(statuses_map.values()) == job_count:
            return True
        # condition not met yet: log the group and each of its jobs for debugging
        logger.debug(f'Group details: {group}')
        job_details = query_jobs(cook_url, assert_response=True, uuid=group['jobs']).json()
        for job in job_details:
            logger.debug(f'Job details: {json.dumps(job, sort_keys=True)}')
        return False

    try:
        submitted_jobs, submit_response = submit_jobs(cook_url, job_spec, job_count, groups=[group_spec])
        assert submit_response.status_code == 201, submit_response
        # first pass: wait for the statuses named in predicate_statuses
        wait_until(group_query, status_condition)
        # retry the group's jobs (only the failed ones when retry_failed_jobs_only is set)
        retry_jobs(cook_url, increment=1, failed_only=retry_failed_jobs_only, groups=[group_uuid])
        # second pass: wait for the same statuses once more
        wait_until(group_query, status_condition)
        # hand the final job details back to the caller for assertion checks
        final_response = query_jobs(cook_url, assert_response=True, uuid=submitted_jobs)
        return final_response.json()
    finally:
        # make sure no jobs are left running/waiting
        kill_groups(cook_url, [group_uuid])
def user_current_usage(cook_url, **kwargs):
    """
    Fetch a user's current resource usage (derived from their
    currently running jobs) from cook's usage endpoint.
    The keyword arguments are sent as the request's query parameters.
    """
    usage_url = f'{cook_url}/usage'
    return session.get(usage_url, params=kwargs)
def query_queue(cook_url):
    """Fetch the current jobs from the queue endpoint (admin-only)"""
    queue_url = f'{cook_url}/queue'
    return session.get(queue_url)
def get_limit(cook_url, limit_type, user, pool=None):
    """
    Issue a GET to {cook_url}/{limit_type} for the given user.
    The pool is included as a query parameter only when it is not None.
    """
    if pool is None:
        query = {'user': user}
    else:
        query = {'user': user, 'pool': pool}
    return session.get(f'{cook_url}/{limit_type}', params=query)
def set_limit(cook_url, limit_type, user, mem=None, cpus=None, gpus=None, count=None, reason='testing', pool=None):
    """
    Post new resource limits for the given user.
    limit_type should be either 'share' or 'quota', and selects which kind of limit is set.
    Any subset of mem, cpus, gpus and count (job-count) may be supplied; values left as None are omitted.
    The reason and pool are added to the request body only when they are not None.
    """
    requested = (('mem', mem), ('cpus', cpus), ('gpus', gpus), ('count', count))
    limits = {name: value for name, value in requested if value is not None}
    body = {'user': user, limit_type: limits}
    # optional top-level fields, added in the same order as before: reason, then pool
    optional_fields = (('reason', reason), ('pool', pool))
    body.update((name, value) for name, value in optional_fields if value is not None)
    logger.debug(f'Setting {user} {limit_type} to {limits}: {body}')
    return session.post(f'{cook_url}/{limit_type}', json=body)
def reset_limit(cook_url, limit_type, user, reason='testing', pool=None):
"""
Resets resource limits for the given user to the default for the cluster.
The limit_type parameter should be either 'share' or 'quota', specifying which type of limit is being reset.
"""
params = {'user': user}