-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy path20391420 editable.rtf
119 lines (109 loc) · 7.57 KB
/
20391420 editable.rtf
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
{\rtf1\ansi\ansicpg1252\cocoartf2513
\cocoatextscaling0\cocoaplatform0{\fonttbl\f0\fnil\fcharset0 HelveticaNeue-Bold;\f1\fnil\fcharset0 HelveticaNeue;\f2\fnil\fcharset0 Menlo-Regular;
\f3\fnil\fcharset0 Menlo-Bold;\f4\froman\fcharset0 Times-Roman;\f5\froman\fcharset0 Times-Bold;
}
{\colortbl;\red255\green255\blue255;\red0\green0\blue0;}
{\*\expandedcolortbl;;\cssrgb\c0\c0\c0;}
{\*\listtable{\list\listtemplateid1\listhybrid{\listlevel\levelnfc1\levelnfcn1\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace360\levelindent0{\*\levelmarker \{upper-roman\}.}{\leveltext\leveltemplateid1\'02\'00.;}{\levelnumbers\'01;}\fi-360\li720\lin720 }{\listlevel\levelnfc3\levelnfcn3\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace360\levelindent0{\*\levelmarker \{upper-alpha\}.}{\leveltext\leveltemplateid2\'02\'01.;}{\levelnumbers\'01;}\fi-360\li1440\lin1440 }{\listlevel\levelnfc0\levelnfcn0\leveljc0\leveljcn0\levelfollow0\levelstartat1\levelspace360\levelindent0{\*\levelmarker \{decimal\}.}{\leveltext\leveltemplateid3\'02\'02.;}{\levelnumbers\'01;}\fi-360\li2160\lin2160 }{\listname ;}\listid1}}
{\*\listoverridetable{\listoverride\listid1\listoverridecount0\ls1}}
\margl1440\margr1440
\deftab720
\pard\pardeftab720\partightenfactor0
\f0\b\fs36 \cf2 \up0 \nosupersub \ulnone 20 QUERIES\
\pard\pardeftab720\partightenfactor0
\f1\b0\fs24 \cf2 WARD - 20391420- UC IRVINE - CS 121 - F2020\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardeftab20\partightenfactor0
\f2 \cf2 [\
"MOSFET", # Term that Does not Exist in Dataset\
"Dingo ate me baby", # Query with terms that exist in dataset, but whole phrase does not\
#From Top Common Words\
"support document",\
"browser",\
"sourcer",\
"cbcl",\
# From Least Common Words\
"lawks",\
"lawler",\
"lave-man",\
# Query From Bold / Heading / Title\
"breast cancer wisconsin",\
"language for distributed embedded systems",\
"Ai club",\
# Long Queries\
"the university of california irvine ai club workshop",\
# Assumedly Common Words (more matching documents)\
"master of software engineering",\
"computer science",\
"informatics",\
"computable plant",\
"a", # One Letter Query\
"krisberg org", # Query Of Common and Least Common Term\
"{\field{\*\fldinst{HYPERLINK "mailto:kovarik@mcmail.cis.mcmaster.ca"}}{\fldrslt \ul kovarik@mcmail.cis.mcmaster.ca}}", # Long single word\
]\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardeftab20\partightenfactor0
\f3\b \cf2 \
\pard\pardeftab720\partightenfactor0
\f0\fs32 \cf2 \
POOREST PERFORMERS\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardeftab20\partightenfactor0
\f2\b0\fs20 \cf2 the university of california irvine ai club workshop 1259.263499999996ms\
the university of california irvine ai club workshop 653.0355000000014ms\
sourcer 432.52720000000267ms\
computer science 608.8585999999978ms 424.3296000000001ms\
a 638.1358000000006ms\
master of software engineering 295.1747000000005ms\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardeftab20\partightenfactor0
\f4\fs24 \cf2 \
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardeftab20\partightenfactor0
\f2 \cf2 \
\
*** In final performance they were all brought down to around 50ms and certainly below 300ms.***\
\pard\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardeftab20\partightenfactor0
\f4 \cf2 \
\pard\pardeftab720\partightenfactor0
\f1\fs40 \cf2 Methods of Improvement\
\pard\tx20\tx392\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardeftab20\li392\fi-393\partightenfactor0
\ls1\ilvl0
\f4\fs24 \cf2 \up0 \nosupersub \ulnone {\listtext I. }\up0 \nosupersub \ulnone General Improvements\
\pard\tx360\tx752\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardeftab20\li752\fi-753\partightenfactor0
\ls1\ilvl1\cf2 \up0 \nosupersub \ulnone {\listtext A. }\up0 \nosupersub \ulnone Cut down the number of iterations for the query TF-IDF by using a set() of the query terms, so that a term repeated in the query is processed only once\
\ls1\ilvl1\up0 \nosupersub \ulnone {\listtext B. }\up0 \nosupersub \ulnone Removed exact duplicates with
\f5\b MD5 hashing
\f4\b0 . Attempted to implement SimHash, but could not solve the O(n^2) Hamming distance problem.\
\ls1\ilvl1\up0 \nosupersub \ulnone {\listtext C. }\up0 \nosupersub \ulnone Removed "if" checks and replaced them with try/except: "better to ask for forgiveness than for permission"\
\ls1\ilvl1\up0 \nosupersub \ulnone {\listtext D. }\up0 \nosupersub \ulnone Replaced dicts with defaultdict to remove key checks\
\pard\tx20\tx392\tx720\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardeftab20\li392\fi-393\partightenfactor0
\ls1\ilvl0\cf2 \up0 \nosupersub \ulnone {\listtext II. }\up0 \nosupersub \ulnone Lecture Derived Search Improvements\
\pard\tx360\tx752\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardeftab20\li752\fi-753\partightenfactor0
\ls1\ilvl1\cf2 \up0 \nosupersub \ulnone {\listtext A. }
\f5\b \up0 \nosupersub \ulnone Minimal seek positions,
\f4\b0 made sure data was stored in no more than 2-3 files/positions per file\
\ls1\ilvl1\up0 \nosupersub \ulnone {\listtext B. }
\f5\b \up0 \nosupersub \ulnone Pre-generated "maps"
\f4\b0 in their own JSON files, allowing for modularity in testing, and when loaded in memory before search time, allows for "access over calculation"\
\pard\tx720\tx1112\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardeftab20\li1112\fi-1113\partightenfactor0
\ls1\ilvl2\cf2 \up0 \nosupersub \ulnone {\listtext 1. }\up0 \nosupersub \ulnone \{docID:url\}, \
\ls1\ilvl2\up0 \nosupersub \ulnone {\listtext 2. }\up0 \nosupersub \ulnone \{url:docID\}, \
\ls1\ilvl2\up0 \nosupersub \ulnone {\listtext 3. }\up0 \nosupersub \ulnone \{docID:bolded_words\}, \
\ls1\ilvl2\up0 \nosupersub \ulnone {\listtext 4. }\up0 \nosupersub \ulnone \{docID:links\}, \
\ls1\ilvl2\up0 \nosupersub \ulnone {\listtext 5. }\up0 \nosupersub \ulnone \{docID:page_rank\}, \
\ls1\ilvl2\up0 \nosupersub \ulnone {\listtext 6. }\up0 \nosupersub \ulnone \{docID:hash\}, \
\ls1\ilvl2\up0 \nosupersub \ulnone {\listtext 7. }\up0 \nosupersub \ulnone [docID], \
\ls1\ilvl2\up0 \nosupersub \ulnone {\listtext 8. }\up0 \nosupersub \ulnone [termID], \
\ls1\ilvl2\up0 \nosupersub \ulnone {\listtext 9. }\up0 \nosupersub \ulnone \{term:corpus_frequency\}, \
\ls1\ilvl2\up0 \nosupersub \ulnone {\listtext 10. }\up0 \nosupersub \ulnone \{bolded_word:[docID]\}, \
\pard\tx360\tx752\tx1440\tx2160\tx2880\tx3600\tx4320\tx5040\tx5760\tx6480\tx7200\tx7920\tx8640\pardeftab20\li752\fi-753\partightenfactor0
\ls1\ilvl1\cf2 \up0 \nosupersub \ulnone {\listtext C. }
\f5\b \up0 \nosupersub \ulnone During document pruning
\f4\b0 process, method of
\f5\b iterating through sorted query TF-IDF
\f4\b0 of terms to append documents to selection was used.\
\ls1\ilvl1\up0 \nosupersub \ulnone {\listtext D. }\up0 \nosupersub \ulnone Take the set union of each subset of docIDs until the minimum document requirement is met. If it goes over, prune via set intersection.\
\ls1\ilvl1\up0 \nosupersub \ulnone {\listtext E. }\up0 \nosupersub \ulnone Stripped stop words from the query if the query is sufficiently long / the ratio of stop words to contextual words is high\
\ls1\ilvl1\up0 \nosupersub \ulnone {\listtext F. }\up0 \nosupersub \ulnone Accuracy and speed were improved by utilizing 
\f5\b bold / title / heading
\f4\b0 in tandem with base TF-IDF\
\ls1\ilvl1\up0 \nosupersub \ulnone {\listtext G. }\up0 \nosupersub \ulnone Implemented
\f5\b PageRank
\f4\b0 for additional accuracy.\
}