-
-
Notifications
You must be signed in to change notification settings - Fork 25
/
EmailSynchronizer.php
243 lines (211 loc) · 8.7 KB
/
EmailSynchronizer.php
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
<?php declare(strict_types=1);
namespace Externals;
use DateTime;
use DateTimeInterface;
use DateTimeZone;
use Doctrine\DBAL\Exception\UniqueConstraintViolationException;
use Externals\Email\Email;
use Externals\Email\EmailAddressParser;
use Externals\Email\EmailContentParser;
use Externals\Email\EmailRepository;
use Externals\Email\EmailSubjectParser;
use Externals\Search\SearchIndex;
use Psr\Log\LoggerInterface;
use Rvdv\Nntp\Client;
use Rvdv\Nntp\Command\ArticleCommand;
use Rvdv\Nntp\Connection\Connection;
use Rvdv\Nntp\Exception\UnknownHandlerException;
use Throwable;
use ZBateson\MailMimeParser\Header\DateHeader;
use ZBateson\MailMimeParser\IMessage;
use ZBateson\MailMimeParser\MailMimeParser;
/**
 * Fetches new messages of the `php.internals` newsgroup from `news.php.net`,
 * stores them through the email repository and indexes them for search.
 */
class EmailSynchronizer
{
    /**
     * Some articles that should never
     * be attempted to be fetched.
     */
    public const BROKEN_MESSAGES = [
        992,
        27418,
        69049,
        69050,
        // See https://github.com/mnapoli/externals/issues/173
        117903,
        // See https://github.com/mnapoli/externals/issues/191
        121607,
    ];

    public function __construct(
        private EmailRepository $emailRepository,
        private EmailSubjectParser $subjectParser,
        private EmailContentParser $contentParser,
        private SearchIndex $searchIndex,
        private LoggerInterface $logger,
        private \Doctrine\DBAL\Connection $db
    ) {
    }

    /**
     * Synchronizes every message published after the last one known to the repository.
     *
     * Blacklisted messages (see BROKEN_MESSAGES) and messages the news server cannot
     * deliver are skipped with a warning. Threads are refreshed once at the end if at
     * least one message number was processed.
     *
     * @param int|null $maxNumberOfEmailsToSynchronize Stop after a synchronized message once this many
     *                                                 message numbers have been processed (skipped
     *                                                 messages count too); null means no limit.
     */
    public function synchronize(?int $maxNumberOfEmailsToSynchronize = null): void
    {
        $client = new Client(new Connection('news.php.net', 119));
        $client->connect();

        $count = 0;

        // The connection must be closed even if fetching or storing a message throws.
        try {
            $group = $client->group('php.internals');
            $numberOfLastEmailToSynchronize = (int) $group['last'];
            $numberOfLastEmailSynchronized = $this->emailRepository->getLastEmailNumber();

            if ($maxNumberOfEmailsToSynchronize !== null) {
                $this->logger->info(sprintf(
                    '%d emails will be synchronized',
                    min($numberOfLastEmailToSynchronize - $numberOfLastEmailSynchronized, $maxNumberOfEmailsToSynchronize)
                ));
            }

            for ($number = $numberOfLastEmailSynchronized + 1; $number <= $numberOfLastEmailToSynchronize; $number++) {
                $count++;

                if (in_array($number, self::BROKEN_MESSAGES, true)) {
                    $this->logger->warning("Skipping blacklisted message $number");
                    continue;
                }

                $this->logger->info("Synchronizing message $number");
                try {
                    $rawContent = $client->sendCommand(new ArticleCommand((string) $number));
                } catch (UnknownHandlerException) {
                    // Some messages seem to trigger errors on the news server and we cannot fetch them
                    $this->logger->warning("Cannot fetch message $number, skipping");
                    continue;
                }

                $this->synchronizeEmail($number, $rawContent);

                if ($maxNumberOfEmailsToSynchronize !== null && ($count >= $maxNumberOfEmailsToSynchronize)) {
                    break;
                }
            }
        } finally {
            $client->disconnect();
        }

        // Refresh threads
        if ($count > 0) {
            $this->emailRepository->refreshThreads();
        }
    }

    /**
     * Parses a raw message, resolves the thread it belongs to, then stores and indexes it.
     *
     * The message is skipped (with a warning in the log) when it is not valid UTF-8, has no
     * usable `From` header, has an unparsable date, or when its message ID already exists
     * in the database.
     *
     * @param int    $number Article number of the message on the news server.
     * @param string $source Raw source of the message (headers and body).
     */
    public function synchronizeEmail(int $number, string $source): void
    {
        // Check that the string is valid UTF-8, else we cannot store it in database or do anything with it
        if (! mb_check_encoding($source, 'UTF-8')) {
            $this->logger->warning("Cannot synchronize message $number because it contains invalid UTF-8 characters");
            return;
        }

        $mailParser = new MailMimeParser;
        $parsedDocument = $mailParser->parse($source, false);

        $subject = $this->subjectParser->sanitize((string) $parsedDocument->getHeaderValue('subject'));
        $content = $this->contentParser->parse((string) $parsedDocument->getTextContent());

        // We don't use the special AddressHeader class because it doesn't seem to parse the
        // person's name at all
        $fromHeader = $parsedDocument->getHeader('from');
        if (! $fromHeader) {
            $this->logger->warning("Cannot synchronize message $number because it contains no 'from' header");
            return;
        }
        $emailAddressParser = new EmailAddressParser($fromHeader->getRawValue());
        $fromArray = $emailAddressParser->parse();
        $from = reset($fromArray);
        if ($from === false) {
            // reset() returns false on an empty list: the header exists but holds no address we can use
            $this->logger->warning("Cannot synchronize message $number because its 'from' header contains no valid address");
            return;
        }

        $emailId = $parsedDocument->getHeaderValue('message-id');

        // Extract the message we're replying to from the "In-Reply-To" header (first ID wins)
        $inReplyToIds = $this->extractMessageIds($parsedDocument->getHeaderValue('In-Reply-To'));
        $inReplyTo = $inReplyToIds[0] ?? null;

        // Extract the thread ID from the "references" header
        $referenceIds = $this->extractMessageIds($parsedDocument->getHeaderValue('References'));
        $firstReference = $referenceIds[0] ?? null;
        if ($inReplyTo === null && $referenceIds !== []) {
            // In old mails the `In-Reply-To` header didn't exist, instead it was at the end of the references
            // Example: https://externals.io/message/2536#2784
            $inReplyTo = $referenceIds[array_key_last($referenceIds)];
        }

        $threadId = null;
        if ($firstReference !== null) {
            // When using the iPhone mailer, references may not have the root email of the thread.
            // See https://github.com/mnapoli/externals/pull/189/files
            $threadId = $this->findEmailThreadId($firstReference);
        } elseif ($inReplyTo !== null) {
            // We know it is a reply to an email but we weren't able to find the thread ID: let's find it from our database
            $threadId = $this->findEmailThreadId($inReplyTo);
        }
        // No thread ID: this is a new thread
        if ($threadId === null) {
            $threadId = $emailId;
        }

        $date = $this->parseDateTime($parsedDocument);
        if (! $date) {
            $this->logger->warning("Cannot synchronize message $number because it contains an invalid date");
            return;
        }

        $newEmail = new Email(
            $emailId,
            $number,
            $subject,
            $content,
            $source,
            $threadId,
            $date,
            $from,
            $inReplyTo
        );

        $this->db->transactional(function () use ($newEmail): void {
            try {
                $this->emailRepository->add($newEmail);
            } catch (UniqueConstraintViolationException) {
                // For some reason the email ID was already used...
                $this->logger->warning("Cannot synchronize message {$newEmail->getNumber()} because the email ID {$newEmail->getId()} already exists in database");
                return;
            }
            // Index in Algolia
            $this->searchIndex->indexEmail($newEmail);
        });
    }

    /**
     * Splits the value of a header such as `In-Reply-To` or `References` into message IDs.
     *
     * The value is cut after every `>` character, each piece is trimmed and empty pieces
     * are dropped.
     *
     * @return list<string> The IDs in the order they appear in the header; empty if there are none.
     */
    private function extractMessageIds(?string $headerValue): array
    {
        if (! $headerValue) {
            return [];
        }

        $pieces = preg_split('/(?<=>)/', $headerValue);
        if ($pieces === false) {
            return [];
        }

        return array_values(array_filter(array_map('trim', $pieces)));
    }

    /**
     * Reads the date of the message and converts it to UTC.
     *
     * @return DateTimeInterface|null Null when the message has no date header or the date cannot be parsed.
     */
    private function parseDateTime(IMessage $parsedDocument): ?DateTimeInterface
    {
        $dateHeader = $parsedDocument->getHeader('date');
        if (! $dateHeader) {
            return null;
        }

        $date = $dateHeader instanceof DateHeader ? $dateHeader->getDateTime() : null;

        if (! $date instanceof DateTime) {
            // Some dates cannot be parsed using the standard format, for example "13 Mar 2003 12:44:07 -0500"
            $rawDate = trim((string) $dateHeader->getValue());
            if ($rawDate === '') {
                // An empty string would make DateTime silently return the current time
                return null;
            }
            try {
                $date = new DateTime($rawDate);
            } catch (Throwable) {
                // Some dates cannot be parsed
                return null;
            }
        }

        // We store all the dates in UTC
        $date->setTimezone(new DateTimeZone('UTC'));

        return $date;
    }

    /**
     * Finds the ID of the thread that the email identified by $targetId belongs to.
     *
     * @return string|null The thread root ID, or null when the email is not in the repository.
     */
    private function findEmailThreadId(string $targetId): ?string
    {
        try {
            $email = $this->emailRepository->getById($targetId);
        } catch (NotFound) {
            // We didn't find the thread, let's move on
            return null;
        }

        // A non-empty thread ID means $email is inside a thread (but not the root): return the root ID.
        // Otherwise $email is the thread root itself.
        return $email->getThreadId() ?: $email->getId();
    }
}