Removes support for plio-analytics and BIGQUERY #310

Merged
merged 26 commits on Jan 27, 2022
Changes from 4 commits
Commits
26 commits
a582ec9
NEW: updated endpoint for listing plios and removed endpoint for list…
dalmia Jan 21, 2022
a988bc0
DEL: removed BIGQUERY support
dalmia Jan 21, 2022
cdb798b
NEW: added endpoint for calculating plio metrics
dalmia Jan 21, 2022
00f9282
FIX: handle the case when only one session
dalmia Jan 21, 2022
713518e
FIX: failing tests
dalmia Jan 24, 2022
3266e24
NEW: test for checking num_views in fetching plio list
dalmia Jan 25, 2022
9ab6179
NEW: more tests for metrics - average watch time and num views
dalmia Jan 25, 2022
4de7586
FIX: merge conflicts with master
dalmia Jan 25, 2022
a8d3111
NEW: tests for retention calculation
dalmia Jan 25, 2022
7184f0f
NEW: tests for question metrics without any answer
dalmia Jan 25, 2022
f21c604
NEW: tests for question metrics when answers have been added
dalmia Jan 25, 2022
fc13c1f
DEL: all traces of plio-analytics
dalmia Jan 25, 2022
eeffa6e
DEL: all traces of plio-analytics
dalmia Jan 25, 2022
8714ec6
DEL: all traces of plio-analytics
dalmia Jan 25, 2022
9fe2426
DEL: all traces of analytics
dalmia Jan 25, 2022
7375ffa
DEL: all traces of analytics
dalmia Jan 25, 2022
9ea8cd5
DOC: added comments
dalmia Jan 25, 2022
41b5f7d
FIX: user IDs hashed
dalmia Jan 25, 2022
d18d62f
FIX: bug when no session with valid retention
dalmia Jan 25, 2022
afc8717
NEW: added test for checking the bug solved in last commit
dalmia Jan 25, 2022
12436ee
merge master
dalmia Jan 25, 2022
09860b4
ENH: moved sql queries to queries.py
dalmia Jan 27, 2022
fad262c
ENH: add unique_viewers directly to the queryset
dalmia Jan 27, 2022
5c3204a
FIX: failing tests
dalmia Jan 27, 2022
79d8d96
FIX: used the correct type + better variable name
dalmia Jan 27, 2022
c6fa4f7
DEL: useless kwargs
dalmia Jan 27, 2022
20 changes: 0 additions & 20 deletions plio/ordering.py
@@ -1,7 +1,4 @@
from rest_framework.filters import OrderingFilter
from django.db.models import OuterRef, Subquery, Count
from entries.models import Session
from django.db.models.functions import Coalesce


class CustomOrderingFilter(OrderingFilter):
@@ -50,23 +47,6 @@ def filter_queryset(self, request, queryset, view):
ordering = self.get_ordering(request, queryset, view)

if ordering:
# if the ordering fields contain "unique_viewers"
if any("unique_viewers" in order_by for order_by in ordering):
# prepare a session queryset which has an annotated field "count_unique_users"
# that holds the count of unique users for every plio in the plio's queryset
plio_session_group = Session.objects.filter(
plio__uuid=OuterRef("uuid")
).values("plio__uuid")

plios_unique_users_count = plio_session_group.annotate(
count_unique_users=Count("user__id", distinct=True)
).values("count_unique_users")

# annotate the plio's queryset with the count of unique users
queryset = queryset.annotate(
unique_viewers=Coalesce(Subquery(plios_unique_users_count), 0)
)

return queryset.order_by(*ordering)

return queryset
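The subquery removed here is not gone: it moves into PlioViewSet.list (see plio/views.py below), which annotates the queryset before filter_queryset is called, so ordering by unique_viewers keeps working. A minimal sketch of that annotation as a standalone helper, reusing the same ORM pieces this file previously imported; the helper name annotate_unique_viewers is illustrative and not part of the PR:

from django.db.models import OuterRef, Subquery, Count
from django.db.models.functions import Coalesce
from entries.models import Session


def annotate_unique_viewers(queryset):
    """Attach a unique_viewers count (distinct session users) to each plio."""
    plio_session_group = Session.objects.filter(
        plio__uuid=OuterRef("uuid")
    ).values("plio__uuid")
    unique_users_count = plio_session_group.annotate(
        count_unique_users=Count("user__id", distinct=True)
    ).values("count_unique_users")
    # Coalesce maps the NULL subquery result (a plio with no sessions) to 0
    return queryset.annotate(
        unique_viewers=Coalesce(Subquery(unique_users_count), 0)
    )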
96 changes: 85 additions & 11 deletions plio/queries.py
@@ -1,9 +1,74 @@
from typing import Tuple


def get_plio_latest_sessions_query(plio_uuid: str, schema: str, **kwargs):
"""Returns the most recent sessions for each user for the given plio

:param plio_uuid: The plio to fetch the details for
:type plio_uuid: str
:param schema: The schema from which the tables are to be accessed
:type schema: str
"""
return f"""
WITH summary AS (
SELECT
session.id,
plio.uuid as plio_uuid,
session.watch_time,
session.retention,
ROW_NUMBER() OVER(PARTITION BY session.user_id, session.plio_id
ORDER BY session.id DESC) AS rank
FROM {schema}.session
INNER JOIN {schema}.plio AS plio ON plio.id = session.plio_id
)
SELECT id, watch_time, retention
FROM summary
WHERE rank = 1 AND plio_uuid = '{plio_uuid}'"""
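The ROW_NUMBER() window function ranks each user's sessions for a plio from newest to oldest, so keeping rank = 1 yields exactly one row per viewer: their latest session. A minimal sketch of how this helper can be consumed, mirroring the cursor-and-DataFrame pattern in plio/views.py below; the wrapper name fetch_latest_sessions is illustrative:

import pandas as pd
from django.db import connection

from plio.queries import get_plio_latest_sessions_query


def fetch_latest_sessions(plio_uuid: str) -> pd.DataFrame:
    """One row per viewer: id, watch_time and retention of their latest session."""
    with connection.cursor() as cursor:
        cursor.execute(
            get_plio_latest_sessions_query(plio_uuid, connection.schema_name)
        )
        rows = cursor.fetchall()
    return pd.DataFrame(rows, columns=["id", "watch_time", "retention"])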


def get_plio_latest_responses_query(schema: str, session_ids: Tuple[int, ...], **kwargs):
"""
Returns the responses of each user to the given plio based on
their most recent session.

:param schema: The schema from which the tables are to be accessed
:type schema: str
:param session_ids: The database ids corresponding to the most recent session by each user
:type session_ids: Tuple[int, ...]
"""
query = f"""
SELECT
sessionAnswer.id,
session.user_id,
sessionAnswer.answer,
item.type AS item_type,
question.type AS question_type,
question.correct_answer AS question_correct_answer
FROM {schema}.session AS session
INNER JOIN {schema}.session_answer AS sessionAnswer
ON session.id = sessionAnswer.session_id
INNER JOIN {schema}.item AS item
ON item.id=sessionAnswer.item_id
INNER JOIN {schema}.question AS question ON question.item_id = item.id """

# when there is only one id, Python renders the tuple as "(id,)" with a
# trailing comma, which is not valid SQL inside IN (...), so fall back to equality
if len(session_ids) == 1:
query += f"WHERE session.id = {session_ids[0]}"
else:
query += f"WHERE session.id IN {session_ids}"

return query
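A quick illustration of the single-id edge case handled above: formatting a one-element Python tuple keeps the trailing comma, which is not valid SQL, so the helper switches to an equality check:

session_ids = (42,)  # only one most-recent session

# tuple form renders with a trailing comma and breaks the SQL parser
print(f"WHERE session.id IN {session_ids}")    # WHERE session.id IN (42,)

# equality form emitted by the helper for this case
print(f"WHERE session.id = {session_ids[0]}")  # WHERE session.id = 42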


def get_plio_details_query(plio_uuid: str, schema: str, **kwargs):
"""
Returns the details for the given plio

plio_uuid: The plio to fetch the details for.
schema: The schema from which the tables are to be accessed.
:param plio_uuid: The plio to fetch the details for
:type plio_uuid: str
:param schema: The schema from which the tables are to be accessed
:type schema: str
"""
return f"""
SELECT
@@ -24,9 +89,12 @@ def get_sessions_dump_query(plio_uuid: str, schema: str, mask_user_id: bool = True):
"""
Returns the dump of all the sessions for the given plio

plio_uuid: The plio to fetch the details for.
schema: The schema from which the tables are to be accessed.
mask_user_id: whether the user id should be masked
:param plio_uuid: The plio to fetch the details for
:type plio_uuid: str
:param schema: The schema from which the tables are to be accessed
:type schema: str
:param mask_user_id: whether the user id should be masked, defaults to True
:type mask_user_id: bool
"""
return f"""
SELECT
@@ -47,9 +115,12 @@ def get_responses_dump_query(plio_uuid: str, schema: str, mask_user_id: bool = True):
"""
Returns the dump of all the session responses for the given plio

plio_uuid: The plio to fetch the details for.
schema: The schema from which the tables are to be accessed.
mask_user_id: whether the user id should be masked
:param plio_uuid: The plio to fetch the details for
:type plio_uuid: str
:param schema: The schema from which the tables are to be accessed
:type schema: str
:param mask_user_id: whether the user id should be masked, defaults to True
:type mask_user_id: bool
"""
return f"""
SELECT
@@ -75,9 +146,12 @@ def get_events_query(plio_uuid: str, schema: str, mask_user_id: bool = True):
"""
Returns the dump of all events across all sessions for the given plio

plio_uuid: The plio to fetch the details for.
schema: The schema from which the tables are to be accessed.
mask_user_id: whether the user id should be masked
:param plio_uuid: The plio to fetch the details for
:type plio_uuid: str
:param schema: The schema from which the tables are to be accessed
:type schema: str
:param mask_user_id: whether the user id should be masked, defaults to True
:type mask_user_id: bool
"""
return f"""
SELECT
10 changes: 6 additions & 4 deletions plio/tests.py
@@ -264,7 +264,7 @@ def test_user_list_with_plios(self):
)

for index, _ in enumerate(expected_results):
expected_results[index]["num_views"] = 0
expected_results[index]["unique_viewers"] = 0
expected_results[index]["items"] = []

self.assertEqual(
@@ -289,8 +289,10 @@ def test_listing_plios_returns_unique_num_views(self):
plios = response.data["results"]

# plio 2 will be listed first because it was created later
expected_num_views = [0, 2]
self.assertEqual([plio["num_views"] for plio in plios], expected_num_views)
expected_num_unique_viewers = [0, 2]
self.assertEqual(
[plio["unique_viewers"] for plio in plios], expected_num_unique_viewers
)

def test_guest_can_play_plio(self):
# unset the credentials
@@ -702,7 +704,7 @@ def test_metrics_num_views_and_average_watch_time(self):
response = self.client.get(
f"/api/v1/plios/{self.plio_1.uuid}/metrics/",
)
self.assertEqual(response.data["num_views"], 2)
self.assertEqual(response.data["unique_viewers"], 2)
self.assertEqual(response.data["average_watch_time"], 35.0)
self.assertEqual(response.data["percent_one_minute_retention"], None)
self.assertEqual(response.data["accuracy"], None)
98 changes: 33 additions & 65 deletions plio/views.py
@@ -9,7 +9,8 @@
from rest_framework.permissions import IsAuthenticated
from rest_framework.pagination import PageNumberPagination
from django.db import connection
from django.db.models import Q, Count
from django.db.models import Q, Count, OuterRef, Subquery
from django.db.models.functions import Coalesce
from django.http import FileResponse

from django_tenants.utils import get_tenant_model
@@ -38,6 +39,8 @@
get_sessions_dump_query,
get_responses_dump_query,
get_events_query,
get_plio_latest_sessions_query,
get_plio_latest_responses_query,
)
from plio.permissions import PlioPermission
from plio.ordering import CustomOrderingFilter
@@ -176,28 +179,27 @@ def list(self, request):
queryset = Plio.objects.none()

num_plios = queryset.count()

# add the number of unique viewers to the queryset
plio_session_group = Session.objects.filter(plio__uuid=OuterRef("uuid")).values(
"plio__uuid"
)

plios_unique_users_count = plio_session_group.annotate(
count_unique_users=Count("user__id", distinct=True)
).values("count_unique_users")

# annotate the plio's queryset with the count of unique users
queryset = queryset.annotate(
unique_viewers=Coalesce(Subquery(plios_unique_users_count), 0)
)

queryset = self.filter_queryset(queryset)
page = self.paginate_queryset(queryset.values())

if page is not None:
# find the number of unique views
plio_ids = list(queryset.values_list(flat=True))

unique_user_counts = (
Session.objects.filter(plio__in=plio_ids)
.values("plio")
.annotate(count=Count("user", distinct=True))
).order_by()

plio_id_to_count = {
unique_user_count["plio"]: unique_user_count["count"]
for unique_user_count in unique_user_counts
}

# add the number of unique viewers and items corresponding
# to the plio in each plio object
for index, plio in enumerate(page):
page[index]["num_views"] = plio_id_to_count.get(plio["id"], 0)
# add the items corresponding to the plio in each plio object
for index, _ in enumerate(page):
page[index]["items"] = ItemSerializer(
queryset[index].item_set, many=True
).data
@@ -367,30 +369,16 @@ def metrics(self, request, uuid):

import numpy as np

query = f"""
WITH summary AS (
SELECT
session.id,
session.plio_id,
session.watch_time,
session.retention,
ROW_NUMBER() OVER(PARTITION BY session.user_id, session.plio_id

ORDER BY session.id DESC) AS rank
FROM {connection.schema_name}.session
)
SELECT id, watch_time, retention
FROM summary
WHERE rank = 1 AND plio_id = {plio.id}
"""
with connection.cursor() as cursor:
cursor.execute(query)
cursor.execute(
get_plio_latest_sessions_query(plio.uuid, connection.schema_name)
)
results = cursor.fetchall()

df = pd.DataFrame(results, columns=["id", "watch_time", "retention"])

# number of unique viewers and average watch time
num_views = len(df)
num_unique_viewers = len(df)
average_watch_time = df["watch_time"].mean()

# retention at one minute
@@ -423,7 +411,7 @@ def metrics(self, request, uuid):

# checks if a given user has crossed the second mark
percent_one_minute_retention = np.round(
((retention.sum(axis=1) > 0).sum() / num_views) * 100, 2
((retention.sum(axis=1) > 0).sum() / num_unique_viewers) * 100, 2
)

# question-based metrics
@@ -436,32 +424,12 @@ def metrics(self, request, uuid):
percent_completed = None

else:
query = f"""
SELECT
sessionAnswer.id,
session.user_id,
sessionAnswer.answer,
item.type AS item_type,
question.type AS question_type,
question.correct_answer AS question_correct_answer
FROM {connection.schema_name}.session AS session
INNER JOIN {connection.schema_name}.session_answer AS sessionAnswer
ON session.id = sessionAnswer.session_id
INNER JOIN {connection.schema_name}.item AS item
ON item.id=sessionAnswer.item_id
INNER JOIN {connection.schema_name}.question AS question ON question.item_id = item.id
"""
session_ids = tuple(df["id"])

# for some reason, when there is only one id, we cannot use the
# tuple form and have to resort to equality
if len(session_ids) == 1:
query += f"WHERE session.id = {session_ids[0]}"
else:
query += f"WHERE session.id IN {session_ids}"

with connection.cursor() as cursor:
cursor.execute(query)
cursor.execute(
get_plio_latest_responses_query(
connection.schema_name, tuple(df["id"])
)
)
results = cursor.fetchall()

df = pd.DataFrame(
@@ -520,7 +488,7 @@ def is_answer_correct(row):
num_correct_list = np.array(num_correct_list)
average_num_answered = round(num_answered_list.mean())
percent_completed = np.round(
100 * (sum(num_answered_list == num_questions) / num_views), 2
100 * (sum(num_answered_list == num_questions) / num_unique_viewers), 2
)

# only use the responses from viewers who have answered at least
@@ -538,7 +506,7 @@ def is_answer_correct(row):

return Response(
{
"num_views": num_views,
"unique_viewers": num_unique_viewers,
"average_watch_time": average_watch_time,
"percent_one_minute_retention": percent_one_minute_retention,
"accuracy": accuracy,