-
Notifications
You must be signed in to change notification settings - Fork 11
/
fields.py
206 lines (161 loc) · 7.35 KB
/
fields.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
"""
Convenience functions to make consistent field conventions easier.
Per OEP-38, we're using the MySQL-friendly convention of BigInt ID as a
primary key + separate UUID column.
https://open-edx-proposals.readthedocs.io/en/latest/best-practices/oep-0038-Data-Modeling.html
We have helpers to make case sensitivity consistent across backends. MySQL is
case-insensitive by default, SQLite and Postgres are case-sensitive.
"""
from __future__ import annotations
import hashlib
import uuid
from django.db import models
from .collations import MultiCollationMixin
from .validators import validate_utc_datetime
def create_hash_digest(data_bytes: bytes) -> str:
"""
Create a 40-byte, lower-case hex string representation of a hash digest.
The hash digest itself is 20-bytes using BLAKE2b.
DON'T JUST MODIFY THIS HASH BEHAVIOR!!! We use hashing for de-duplication
purposes. If this hash function ever changes, that deduplication will fail
because the hashing behavior won't match what's already in the database.
If we want to change this representation one day, we should create a new
function for that and do the appropriate data migration.
"""
return hashlib.blake2b(data_bytes, digest_size=20).hexdigest()
def case_insensitive_char_field(**kwargs) -> MultiCollationCharField:
"""
Return a case-insensitive ``MultiCollationCharField``.
This means that entries will sort in a case-insensitive manner, and that
unique indexes will be case insensitive, e.g. you would not be able to
insert "abc" and "ABC" into the same table field if you put a unique index
on this field.
You may override any argument that you would normally pass into
``MultiCollationCharField`` (which is itself a subclass of ``CharField``).
"""
# Set our default arguments
final_kwargs = {
"null": False,
"db_collations": {
"sqlite": "NOCASE",
# We're using utf8mb4_unicode_ci to keep MariaDB compatibility,
# since their collation support diverges after this. MySQL is now on
# utf8mb4_0900_ai_ci based on Unicode 9, while MariaDB has
# uca1400_ai_ci based on Unicode 14.
"mysql": "utf8mb4_unicode_ci",
},
}
# Override our defaults with whatever is passed in.
final_kwargs.update(kwargs)
return MultiCollationCharField(**final_kwargs)
def case_sensitive_char_field(**kwargs) -> MultiCollationCharField:
"""
Return a case-sensitive ``MultiCollationCharField``.
This means that entries will sort in a case-sensitive manner, and that
unique indexes will be case sensitive, e.g. "abc" and "ABC" would be
distinct and you would not get a unique constraint violation by adding them
both to the same table field.
You may override any argument that you would normally pass into
``MultiCollationCharField`` (which is itself a subclass of ``CharField``).
"""
# Set our default arguments
final_kwargs = {
"null": False,
"db_collations": {
"sqlite": "BINARY",
"mysql": "utf8mb4_bin",
},
}
# Override our defaults with whatever is passed in.
final_kwargs.update(kwargs)
return MultiCollationCharField(**final_kwargs)
def immutable_uuid_field() -> models.UUIDField:
"""
Stable, randomly-generated UUIDs.
These can be used as stable identifiers by other services that do not share
a database, but you should prefer to make a ForeignKey to the primary (id)
key of the model if you're in the same process.
"""
return models.UUIDField(
default=uuid.uuid4,
blank=False,
null=False,
editable=False,
unique=True,
verbose_name="UUID", # Just makes the Django admin output properly capitalized
)
def key_field(**kwargs) -> MultiCollationCharField:
"""
Externally created Identifier fields.
These will often be local to a particular scope, like within a
LearningPackage. It's up to the application as to whether they're
semantically meaningful or look more machine-generated.
Other apps should *not* make references to these values directly, since
these values may in theory change (even if this is rare in practice).
"""
return case_sensitive_char_field(max_length=500, blank=False, **kwargs)
def hash_field() -> models.CharField:
"""
Holds a hash digest meant to identify a piece of content.
Do not assume that this is secure or globally unique. Accidental collisions
are extremely unlikely, but we don't want to get into a place where someone
can maliciously craft a collision and affect other users.
Use the create_hash_digest function to generate data suitable for this
field.
There are a couple of ways that we could have stored this more efficiently,
but we don't at this time:
1. A BinaryField would be the most space efficient, but Django doesn't
support indexing a BinaryField in a MySQL database.
2. We could make the field case-sensitive and run it through a URL-safe
base64 encoding. But the amount of space this saves vs. the complexity
didn't seem worthwhile, particularly the possibility of case-sensitivity
related bugs.
"""
return models.CharField(
max_length=40,
blank=False,
null=False,
editable=False,
)
def manual_date_time_field() -> models.DateTimeField:
"""
DateTimeField that does not auto-generate values.
The datetimes entered for this field *must be UTC* or it will raise a
ValidationError.
The reason for this convention is that we are often creating many rows of
data in the same transaction. They are semantically being created or
modified "at the same time", even if each individual row is milliseconds
apart. This convention forces the caller to set a datetime up front and pass
it in manually, so all the affected rows have the exact same time. This
makes it easier to see which rows were changed at the same time more easily.
When using these fields, the other conventions from OEP-38 still hold:
* common field names: created, modified
* Django's USE_TZ setting should be True
* Times should be in UTC as a general rule
"""
return models.DateTimeField(
auto_now=False,
auto_now_add=False,
null=False,
validators=[
validate_utc_datetime,
],
)
class MultiCollationCharField(MultiCollationMixin, models.CharField):
"""
CharField subclass with per-database-vendor collation settings.
Django's CharField already supports specifying the database collation, but
that only works with a single value. So there would be no way to say, "Use
utf8mb4_bin for MySQL, and BINARY if we're running SQLite." This is a
problem because we run tests in SQLite (and may potentially run more later).
It's also a problem if we ever want to support other database backends, like
PostgreSQL. Even MariaDB is starting to diverge from MySQL in terms of what
collations are supported.
"""
class MultiCollationTextField(MultiCollationMixin, models.TextField):
"""
TextField subclass with per-database-vendor collation settings.
We don't ever really want to _sort_ by a TextField, but setting a collation
forces the compatible charset to be set in MySQL, and that's the part that
matters for our purposes.
"""