-
Notifications
You must be signed in to change notification settings - Fork 0
/
DocumentContent.cs
41 lines (38 loc) · 1.25 KB
/
DocumentContent.cs
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
namespace Antiplagiarism
{
/// <summary>
/// Tokenized content of a single document read from disk.
/// </summary>
public class DocumentContent
{
    /// <summary>
    /// File name of the document, without directory and extension.
    /// </summary>
    public string DocumentName;

    /// <summary>
    /// The document's tokens that contain no white-space characters, in document order.
    /// </summary>
    public List<string> Tokens;

    // Every token returned by Tokenizer.Tokenize, in order, before the white-space filter
    // that produces Tokens is applied.
    private readonly List<string> textWithWhiteSpaces;

    /// <summary>
    /// Reads the file at <paramref name="documentName"/> and tokenizes its text.
    /// </summary>
    /// <param name="documentName">Path of the file to read.</param>
    public DocumentContent(string documentName)
    {
        var text = File.ReadAllText(documentName);
        DocumentName = Path.GetFileNameWithoutExtension(documentName);
        textWithWhiteSpaces = Tokenizer.Tokenize(text).ToList();

        // Keep only the tokens in which no character is white space.
        Tokens = textWithWhiteSpaces
            .Where(token => !token.Any(char.IsWhiteSpace))
            .ToList();
    }

    /// <summary>
    /// Labels every token of the document (including the ones filtered out of
    /// <see cref="Tokens"/>) as <see cref="TokenType.Common"/> or <see cref="TokenType.Specific"/>.
    /// </summary>
    /// <param name="commonTokens">
    /// Tokens to look for, in order. A document token is labelled common when it equals the
    /// next not-yet-matched entry of this list; that entry is then consumed.
    /// </param>
    /// <returns>
    /// A lazily evaluated sequence with one (token, type) pair per document token, in document order.
    /// </returns>
    /// <exception cref="ArgumentNullException"><paramref name="commonTokens"/> is null.</exception>
    public IEnumerable<Tuple<string, TokenType>> DevideToCommonAndSpecificTokens(List<string> commonTokens)
    {
        // Validate here, outside the iterator: code inside a 'yield' method does not run until
        // the result is enumerated, which would hide a null argument from the caller.
        if (commonTokens == null)
        {
            throw new ArgumentNullException(nameof(commonTokens));
        }

        return ClassifyTokens(commonTokens);
    }

    // Lazily walks the document tokens, greedily matching them against commonTokens in order.
    private IEnumerable<Tuple<string, TokenType>> ClassifyTokens(List<string> commonTokens)
    {
        var nextCommon = 0;
        foreach (var token in textWithWhiteSpaces)
        {
            var isCommon = nextCommon < commonTokens.Count && token == commonTokens[nextCommon];
            if (isCommon)
            {
                // The matched entry is consumed; later tokens are compared with the following one.
                nextCommon++;
            }

            yield return Tuple.Create(token, isCommon ? TokenType.Common : TokenType.Specific);
        }
    }
}
}