-
Notifications
You must be signed in to change notification settings - Fork 1
/
TglTokenizer.java
132 lines (96 loc) · 4.56 KB
/
TglTokenizer.java
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/*============================================================================*
* Title : Tagalog Stemmer
* Description : Tagalog Stemmer made with Java.
* Filename : TglTokenizer.java
* Version : v1.0
* Author : Santelices, Andrew P.
* Yr&Sec&Uni : BSCS 3-3 PUP Main
* Subject : Natural Language Processing
*============================================================================*/
import java.util.*;
import java.io.*;
import java.lang.*;
/**
 * Splits text into word tokens and punctuation tokens.
 *
 * <p>Tokenization rules, applied character by character:
 * <ul>
 *   <li>Letters and the hyphen ({@code '-'}) accumulate into a word token,
 *       so hyphenated words such as {@code "araw-araw"} stay in one piece.</li>
 *   <li>Consecutive ASCII punctuation characters accumulate into a single
 *       punctuation token (e.g. {@code "?!"}).</li>
 *   <li>Whitespace and the NUL character end the current token.</li>
 *   <li>Any other character (digits, non-ASCII symbols, ...) is dropped
 *       without ending the current token.</li>
 * </ul>
 */
public class TglTokenizer {
    /** File that is tokenized when the user leaves the filename prompt blank. */
    private static final String DEFAULT_FILE = "tagdic.txt";

    /**
     * The characters matched by the regex class {@code \p{Punct}} in its default
     * (ASCII) mode. A plain lookup avoids compiling a regex for every character.
     */
    private static final String PUNCTUATION = "!\"#$%&'()*+,-./:;<=>?@[\\]^_`{|}~";

    /** Creates a tokenizer without processing any text. */
    public TglTokenizer() {}

    /**
     * Creates a tokenizer and tokenizes {@code text} once.
     *
     * <p>The resulting tokens are not stored anywhere; this constructor is kept
     * so existing callers continue to compile.
     *
     * @param text the text to tokenize; may be {@code null}
     */
    public TglTokenizer(String text) {
        tokenize(text);
    }

    /**
     * Reads a whole file and tokenizes its contents.
     *
     * <p>Reading is best-effort: if the file cannot be opened or read, the
     * problem is reported on {@code System.err} and whatever was read up to
     * that point (possibly nothing) is tokenized.
     *
     * @param filename path of the file to read
     * @return the tokens found in the file, in order of appearance
     */
    private ArrayList<String> tokenizeFromFile(String filename) {
        StringBuilder contents = new StringBuilder();
        // try-with-resources closes the reader even when readLine() fails.
        try (BufferedReader reader = new BufferedReader(new FileReader(filename))) {
            String line;
            while ((line = reader.readLine()) != null) {
                contents.append(line).append('\n');
            }
        } catch (IOException ex) {
            System.err.println("Could not read '" + filename + "': " + ex.getMessage());
        }
        return tokenize(contents.toString());
    }

    /**
     * Tokenizes {@code text} according to the rules in the class description.
     *
     * @param text the text to tokenize; {@code null} is treated as empty input
     * @return the tokens in order of appearance; never {@code null}
     */
    private ArrayList<String> tokenize(String text) {
        ArrayList<String> tokens = new ArrayList<String>();
        if (text == null) {
            return tokens;
        }
        StringBuilder word = new StringBuilder();
        StringBuilder punct = new StringBuilder();
        for (int i = 0; i < text.length(); i++) {
            char c = text.charAt(i);
            if (Character.isWhitespace(c) || c == '\0') {
                // A separator ends whichever token is in progress (at most one is).
                flushToken(tokens, word);
                flushToken(tokens, punct);
            } else if (Character.isLetter(c) || c == '-') {
                // Checked before punctuation so that '-' counts as part of a word.
                flushToken(tokens, punct);
                word.append(c);
            } else if (PUNCTUATION.indexOf(c) >= 0) {
                flushToken(tokens, word);
                punct.append(c);
            }
            // Anything else (e.g. digits) is ignored and does not split a token.
        }
        // Emit the token that was still in progress when the text ended.
        flushToken(tokens, word);
        flushToken(tokens, punct);
        return tokens;
    }

    /**
     * Moves the pending characters, if any, into {@code tokens} as one token
     * and empties the buffer.
     *
     * @param tokens  list receiving the finished token
     * @param pending buffer holding the token being built
     */
    private static void flushToken(ArrayList<String> tokens, StringBuilder pending) {
        if (pending.length() > 0) {
            tokens.add(pending.toString());
            pending.setLength(0);
        }
    }

    /**
     * Reads the rest of the current input line.
     *
     * @param input scanner to read from
     * @return the line, or an empty string when the input is exhausted
     */
    private static String readLine(Scanner input) {
        return input.hasNextLine() ? input.nextLine() : "";
    }

    /**
     * Interactively asks on standard input where the text comes from and
     * tokenizes it.
     *
     * <p>Choice {@code 1} tokenizes the next line typed by the user. Choice
     * {@code 2} asks for a filename and tokenizes that file; a blank answer
     * falls back to {@code tagdic.txt}. Any other answer, including
     * non-numeric or missing input, prints {@code "Invalid input!"}.
     *
     * @return the tokens, or an empty list when the choice was invalid
     */
    public ArrayList<String> GetParams() {
        ArrayList<String> tokens = new ArrayList<String>();
        System.out.println("Enter text from: ");
        System.out.print("\t[1] Input Stream\n\t[2] File\n\t>> ");
        // Intentionally not closed: closing the Scanner would also close System.in.
        Scanner input = new Scanner(System.in);
        int choice;
        try {
            choice = input.nextInt();
        } catch (NoSuchElementException ex) {
            // Also covers InputMismatchException (its subclass): non-numeric answer.
            System.out.println("Invalid input!");
            return tokens;
        }
        readLine(input); // Discard the remainder of the menu line.
        if (choice == 1) {
            System.out.print("Enter text: ");
            tokens = tokenize(readLine(input));
        } else if (choice == 2) {
            System.out.print("Enter filename: ");
            String filename = readLine(input).trim();
            if (filename.isEmpty()) {
                filename = DEFAULT_FILE;
            }
            tokens = tokenizeFromFile(filename);
        } else {
            System.out.println("Invalid input!");
        }
        return tokens;
    }
}