/**
* Licensed under the CC-GNU Lesser General Public License, Version 2.1 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://creativecommons.org/licenses/LGPL/2.1/
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: Choochart Haruechaiyasak
// Last update: 28 March 2006
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
namespace Trie
{
class Trie
{
protected Trie parent = null;
protected Trie[] child = new Trie[1];
protected int numChildren = 0;
protected char ch;
protected Boolean isWord = false;
//Creates a Trie using the root symbol as the character
public Trie() {
ch = (char)251;
}
public Trie(char c) {
ch = c;
}
//Used to create the trie nodes when a string is added to a trie
protected Trie createNode(char c) {
return new Trie(c);
}
//Inserts the trie as the last child
protected void addChild(Trie t) {
insertChild(t, numChildren);
}
//Inserts the trie at the specified index.
// If successful, the parent of the specified trie is updated to be this trie.
public void insertChild(Trie t, int index)
{
if (index < 0 || index > numChildren) {
throw new ArgumentException("required: index >= 0 && index <= numChildren");
}
if (t == null) {
throw new ArgumentException("cannot add null child");
}
if (t.parent != null) {
throw new ArgumentException("specified child still belongs to parent");
}
if (hasChar(t.ch)) {
throw new ArgumentException("duplicate chars not allowed");
}
if (isDescendent(t)) {
throw new ArgumentException("cannot add cyclic reference");
}
t.parent = this;
if (numChildren == child.Length) {
Trie[] arr = new Trie[2 * (numChildren + 1)];
for (int i = 0; i < numChildren; i++)
arr[i] = child[i];
child = arr;
}
for (int i = numChildren; i > index; i--)
child[i] = child[i - 1];
child[index] = t;
numChildren++;
}
//Returns true if this node is a descendent of the specified node or this node and the specified
//node are the same node, false otherwise.
public Boolean isDescendent(Trie t)
{
Trie r = this;
while (r != null)
{
if (r == t) {
return true;
}
r = r.parent;
}
return false;
}
//End of tree-level operations. Start of string operations.
//Adds the string to the trie. Returns true if the string is added or false if the string
//is already contained in the trie.
public Boolean add(String s)
{
return add(s, 0);
}
private Boolean add(String s, int index)
{
if (index == s.Length) {
if (isWord) {
return false;
}
isWord = true;
return true;
}
char c = s[index]; //char c=s.charAt(index);
for (int i = 0; i < numChildren; i++) {
if (child[i].ch == c) {
return child[i].add(s, index + 1);
}
}
// this code adds from the bottom to the top because the addChild method
// checks for cyclic references. This prevents quadratic runtime.
int ii = s.Length - 1;
Trie t = createNode(s[ii--]);
t.isWord = true;
while (ii >= index) {
Trie n = createNode(s[ii--]);
n.addChild(t);
t = n;
}
addChild(t);
return true;
}
//Returns the child that has the specified character or null if no child has the specified character.
public Trie getNode(char c)
{
for (int i = 0; i < numChildren; i++) {
if (child[i].ch == c) {
return child[i];
}
}
return null;
}
//Returns the last trie in the path that prefix matches the specified prefix string
//rooted at this node, or null if there is no such prefix path.
public Trie getNode(String prefix) {
return getNode(prefix, 0);
}
private Trie getNode(String prefix, int index)
{
if (index == prefix.Length) {
return this;
}
char c = prefix[index]; //char c=prefix.charAt(index);
for (int i = 0; i < numChildren; i++)
{
if (child[i].ch == c) {
return child[i].getNode(prefix, index + 1);
}
}
return null;
}
//Returns the number of nodes that define isWord as true, starting at this node and including
//all of its descendents. This operation requires traversing the tree rooted at this node.
public int size()
{
int size = 0;
if (isWord) {
size++;
}
for (int i = 0; i < numChildren; i++) {
size += child[i].size();
}
return size;
}
//Returns all of the words in the trie that begin with the specified prefix rooted at this node.
//An array of length 0 is returned if there are no words that begin with the specified prefix.
public String[] getWords(String prefix)
{
Trie n = getNode(prefix);
if (n == null) {
return new String[0];
}
String[] arr = new String[n.size()];
n.getWords(arr, 0);
return arr;
}
private int getWords(String[] arr, int x)
{
if (isWord) {
arr[x++] = ToString();
}
for (int i = 0; i < numChildren; i++) {
x = child[i].getWords(arr, x);
}
return x;
}
//Returns true if the specified string has a prefix path starting at this node.
//Otherwise false is returned.
public Boolean hasPrefix(String s)
{
Trie t = getNode(s);
if (t == null) {
return false;
}
return true;
}
//Check if the specified string is in the trie
//Retrun value if contains, 0 if hasPrefix, else -1
public int contains(String s)
{
Trie t = getNode(s);
if (t == null) {
return -1;
}
if (t.isWord) {
return 1;
}
else {
return 0;
}
}
//Returns true if this node has a child with the specified character.
public Boolean hasChar(char c)
{
for (int i = 0; i < numChildren; i++) {
if (child[i].ch == c) {
return true;
}
}
return false;
}
//Returns the number of nodes from this node up to the root node. The root node has height 0.
public int getHeight()
{
int h = -1;
Trie t = this;
while (t != null) {
h++;
t = t.parent;
}
return h;
}
//Returns a string containing the characters on the path from this node to the root, but
//not including the root character. The last character in the returned string is the
//character at this node.
public String toString()
{
StringBuilder sb = new StringBuilder(getHeight());
Trie t = this;
while (t.parent != null) {
sb.Append(t.ch);
t = t.parent;
}
return new string(sb.ToString().ToCharArray().Reverse().ToArray());
}
// End
}
}
Date :
2014-03-18 23:00:45
By :
love9713
No. 2
Guest
2. LongParseTree.cs
Code (C#)
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
namespace Trie
{
class LongParseTree
{
//Private variables
private Trie dict; //For storing words from dictionary
//Vector
private ArrayList indexList; //List of index positions
private ArrayList typeList; //List of word types
private ArrayList frontDepChar; //Front dependent characters: must have front characters
private ArrayList rearDepChar; //Rear dependent characters: must have rear characters
private ArrayList tonalChar; //Tonal characters
private ArrayList endingChar; //Ending characters
//Vector
/*******************************************************************/
/************************ Constructor ******************************/
/*******************************************************************/
public LongParseTree(Trie dict, ArrayList indexList, ArrayList typeList)
{
this.dict = dict;
this.indexList = indexList;
this.typeList = typeList;
frontDepChar = new ArrayList();
rearDepChar = new ArrayList();
tonalChar = new ArrayList();
endingChar = new ArrayList();
//Adding front-dependent characters
frontDepChar.AddRange(new string[] { "ะ", "ั", "า", "ำ", "ิ", "ี", "ึ", "ื", "ุ", "ู", "ๅ", "็", "์", "ํ" });
//Adding rear-dependent characters
rearDepChar.AddRange(new string[] {"ั", "ื", "เ", "แ", "โ", "ใ", "ไ", "ํ"});
//Adding tonal characters
tonalChar.AddRange(new string[] { "่", "้", "๊", "๋" });
//Adding ending characters
endingChar.AddRange(new string[] { "ๆ", "ฯ" });
}
/****************************************************************/
/********************** nextWordValid ***************************/
/****************************************************************/
private Boolean nextWordValid(int beginPos, String text)
{
int pos = beginPos + 1;
int status;
if (beginPos == text.Length) {
return true;
}
else if (text[beginPos] <= '~') { //Englurish alphabets/digits/special characters
return true;
}
else {
while (pos <= text.Length) {
status = dict.contains(text.Substring(beginPos, (pos - beginPos)));
if (status == 1) {
return true;
}
else if (status == 0) {
pos++;
}
else {
break;
}
}
}
return false;
}
/****************************************************************/
/********************** parseWordInstance ***********************/
/****************************************************************/
public int parseWordInstance(int beginPos, String text)
{
char prevChar = Convert.ToChar(0); //'\0'; //Previous character
int longestPos = -1; //Longest position
int longestValidPos = -1; //Longest valid position
int numValidPos = 0; //Number of longest value pos (for determining ambiguity)
int returnPos = -1; //Returned text position
int pos, status;
status = 1;
numValidPos = 0;
pos = beginPos + 1;
while ((pos <= text.Length) && (status != -1)) {
status = dict.contains(text.Substring(beginPos, (pos - beginPos)));
//Record longest so far
if (status == 1) {
longestPos = pos;
if (nextWordValid(pos, text)) {
longestValidPos = pos;
numValidPos++;
}
}
pos++;
} //while
//For checking rear dependent character
if (beginPos >= 1) {
prevChar = text[beginPos - 1];
}
//Unknow word
if (longestPos == -1) {
returnPos = beginPos + 1;
//Combine unknown segments
if ((indexList.Count > 0) && (frontDepChar.Contains("" + text[beginPos])) || (tonalChar.Contains("" + text[beginPos])) ||
(rearDepChar.Contains("" + prevChar)) || (((short)typeList[typeList.Count - 1] == 0))) {
indexList[indexList.Count - 1] = (short)returnPos;
typeList[typeList.Count - 1] = (short)0;
}
else {
indexList.Add((short)returnPos);
typeList.Add((short)0);
}
return returnPos;
}
//--------------------------------------------------
//Known or ambiguous word
else {
//If there is no merging point
if (longestValidPos == -1) {
//Check whether front char requires rear segment
if (rearDepChar.Contains("" + prevChar)) {
indexList[indexList.Count - 1] = (short)longestPos;
typeList[typeList.Count - 1] = (short)0;
}
else {
typeList.Add((short)1);
indexList.Add((short)longestPos);
}
return longestPos; //known followed by unknown: consider longestPos
}
else {
//Check whether front char requires rear segment
if (rearDepChar.Contains("" + prevChar)) {
indexList[indexList.Count - 1] = (short)longestValidPos;
typeList[typeList.Count - 1] = (short)0;
}
else if (numValidPos == 1) {
typeList.Add((short)1); //know
indexList.Add((short)longestValidPos);
}
else {
typeList.Add((short)2); //ambiguous
indexList.Add((short)longestValidPos);
}
return (longestValidPos);
}
}
}
}
}
Date :
2014-03-18 23:08:31
By :
love9713
No. 3
Guest
3. LonglexTo.cs
Code (C#)
// LongLexTo: Tokenizing Thai texts using Longest Matching Approach
// Note: Types: 0=unknown 1=known 2=ambiguous 3=English/digits 4=special characters
//
// Public methods:
// 1) public LongLexTo(File dictFile); //Constructor with a dictionary file
// 2) public void addDict(File dictFile); //Add dictionary (e.g., unknown-word file)
// 3) public void wordInstance(String text); //Word tokenization
// 4) public void lineInstance(String text); //Line-break tokenization
// 4) public Vector getIndexList();
// 5) Iterator's public methods: hasNext, first, next
//
// Author: Choochart Haruechaiyasak
// Last update: 28 March 2006
using System;
using System.Collections.Generic;
using System.Linq;
using System.Text;
using System.Collections;
using System.IO;
namespace Trie
{
class LongLexTo
{
//Private variables
private Trie dict; //For storing words from dictionary
private LongParseTree ptree; //Parsing tree (for Thai words)
//Returned variables
private ArrayList indexList; //List of word index positions
private ArrayList lineList; //List of line index positions
private ArrayList typeList; //List of word types (for word only)
private IEnumerator iter; //Iterator for indexList OR lineList (depends on the call)
/*******************************************************************/
/*********************** Return index list *************************/
/*******************************************************************/
public ArrayList getIndexList()
{
return indexList;
}
/*******************************************************************/
/*********************** Return type list *************************/
/*******************************************************************/
public ArrayList getTypeList()
{
return typeList;
}
/*******************************************************************/
/******************** Iterator for index list **********************/
/*******************************************************************/
//Return iterator's hasNext for index list
public Boolean hasNext()
{
if (!iter.MoveNext()) {
return false;
}
return true;
}
//Return iterator's first index
public int first()
{
return 0;
}
//Return iterator's next index
public int next()
{
return (short)iter.Current;
}
/*******************************************************************/
/********************** Constructor (default) **********************/
/*******************************************************************/
public LongLexTo()
{
dict = new Trie();
indexList = new ArrayList();
lineList = new ArrayList();
typeList = new ArrayList();
ptree = new LongParseTree(dict, indexList, typeList);
}
public LongLexTo(String input)
{
//System.out.println("Self dict");
dict = new Trie();
indexList = new ArrayList();
lineList = new ArrayList();
typeList = new ArrayList();
ptree = new LongParseTree(dict, indexList, typeList);
}
public LongLexTo(StreamReader input)
{
dict = new Trie();
if (File.Exists("lexitron.txt")) {
StreamReader sr = File.OpenText("lexitron.txt");
addDict(sr);
//System.out.println("ADDED DICT");
}
else {
//System.out.println(" !!! Error: The dictionary file is not found, " + dictFile.getName());
}
indexList = new ArrayList();
lineList = new ArrayList();
typeList = new ArrayList();
ptree = new LongParseTree(dict, indexList, typeList);
}
//...
/*******************************************************************/
/**************************** addDict ******************************/
/*******************************************************************/
public void addDict(String line)
{
line = line.Trim();
if (line.Length > 0) {
dict.add(line);
}
}
public void addDict(StreamReader dictFile) //File dictFile
{
//Read words from dictionary
String line = ""; //, word, word2;
using (StreamReader sr = dictFile) {
while ((line = sr.ReadLine()) != null) {
line = line.Trim();
if (line.Length > 0) {
dict.add(line);
}
//yield return line;
}
}
} //addDict
/****************************************************************/
/************************** wordInstance ************************/
/****************************************************************/
public void wordInstance(String text)
{
//System.out.println("I'm In wordInStance");
indexList.Clear();
typeList.Clear();
int pos, index;
String word;
Boolean found;
char ch;
pos = 0;
while (pos < text.Length) {
//Check for special characters and English words/numbers
ch = text[pos];
//English
if (((ch >= 'A') && (ch <= 'Z')) || ((ch >= 'a') && (ch <= 'z'))) {
while ((pos < text.Length) && (((ch >= 'A') && (ch <= 'Z')) || ((ch >= 'a') && (ch <= 'z')))) {
ch = text[pos++];
}
if (pos < text.Length) {
pos--;
}
indexList.Add((short)pos);
typeList.Add((short)3);
}
//Digits
else if (((ch >= '0' && ch <= '9')) || ((ch >= '๐') && (ch <= '๙'))) {
while ((pos < text.Length) && (((ch >= '0') && (ch <= '9')) || ((ch >= '๐') && (ch <= '๙')) || (ch == ',') || (ch == '.'))) {
ch = text[pos++];
}
if (pos < text.Length) {
pos--;
}
indexList.Add((short)pos);
typeList.Add((short)3);
}
//Special characters
else if ((ch <= '~') || (ch == 'ๆ') || (ch == 'ฯ') || (ch == '“') || (ch == '”') || (ch == ',')) {
pos++;
indexList.Add((short)pos);
typeList.Add((short)4);
}
//Thai word (known/unknown/ambiguous)
else {
pos = ptree.parseWordInstance(pos, text);
}
}//While all text length
iter = (IEnumerator)indexList.GetEnumerator();
} //wordInstance
/****************************************************************/
/************************** lineInstance ************************/
/****************************************************************/
public void lineInstance(String text)
{
int windowSize = 10; //for detecting parentheses, quotes
int curType, nextType, tempType, curIndex, nextIndex, tempIndex;
lineList.Clear();
wordInstance(text);
int i;
for (i = 0; i < typeList.Count -1; i++) {
curType = (short)typeList[i];
curIndex = (short)indexList[i];
if ((curType == 3) || (curType == 4)) {
//Parenthesese
if ((curType == 4) && (text[curIndex - 1] == '(')) {
int pos = i + 1;
while ((pos < typeList.Count) && (pos < i + windowSize)) {
tempType = (short)typeList[pos];
tempIndex = (short)indexList[pos++];
if ((tempType == 4) && (text[tempIndex - 1] == ')')) {
lineList.Add((short)tempIndex);
i = pos - 1;
break;
}
}
}
//Single quote
else if ((curType == 4) && (text[curIndex - 1] == '\'')) {
int pos = i + 1;
while ((pos < typeList.Count) && (pos < i + windowSize)) {
tempType = (short)typeList[pos];
tempIndex = (short)indexList[pos++];
if ((tempType == 4) && (text[tempIndex - 1] == '\'')) {
lineList.Add((short)tempIndex);
i = pos - 1;
break;
}
}
}
//Double quote
else if ((curType == 4) && (text[curIndex - 1] == '\"')) {
int pos = i + 1;
while ((pos < typeList.Count) && (pos < i + windowSize)) {
tempType = (short)typeList[pos];
tempIndex = (short)indexList[pos++];
if ((tempType == 4) && (text[tempIndex - 1] == '\'')) {
lineList.Add((short)tempIndex);
i = pos - 1;
break;
}
}
}
else {
lineList.Add((short)curIndex);
}
}
else {
nextType = (short)typeList[i + 1];
nextIndex = (short)indexList[i + 1];
if ((nextType == 3) || ((nextType == 4) && ((text[nextIndex - 1] == ' ') || (text[nextIndex - 1] == '\"') ||
(text[nextIndex - 1] == '(') || (text[nextIndex - 1] == '\'')))) {
}
else {
if ((curType == 1) && (nextType != 0) && (nextType != 4)) {
lineList.Add((short)indexList[i]);
}
}
if (1 == 2) {
//do something
}
else if (1 == 3) {
//do something
}
}
}
if (i < typeList.Count) {
lineList.Add((short)indexList.Count -1);
}
iter = (IEnumerator)lineList;
}//lineInstance
}//class LongLexTo
}
Date :
2014-03-18 23:20:13
By :
love9713
No. 4
Guest
ตัวอย่างการใช้งาน
Code (C#)
//
//ทดสอบ โปรแกรม
//
private void button1_Click(object sender, EventArgs e)
{
LongLexTo Tokenizer = new LongLexTo("Self");
//LongLexTo Tokenizer = new LongLexTo(System.IO.File.OpenText("lexitron.txt"));
if (System.IO.File.Exists("unknown.txt")) {
System.IO.StreamReader unknownFile = System.IO.File.OpenText("unknown.txt");
Tokenizer.addDict(unknownFile);
}
ArrayList typeList;
int begin, end, type;
String line = "อยู่ดีชอบกินหอย";
Tokenizer.addDict("อยู่ดี");
Tokenizer.addDict("ชอบ");
Tokenizer.addDict("กินหอย");
if (File.Exists("Jim.txt")) {
try {
File.Delete("Jim.txt");
}
catch (Exception) {
throw new Exception("Cann't delete Jim.txt file.");
}
}
//Write test file.
using (FileStream fs = File.Create("Jim.txt", 1024)) {
Byte[] info = new System.Text.UTF8Encoding(true).GetBytes("ภาษาไทย This is some text in the file.");
fs.Write(info, 0, info.Length);
}
Tokenizer.wordInstance(line);
typeList = Tokenizer.getTypeList();
begin = Tokenizer.first();
int i = 0;
String result = "";
while (Tokenizer.hasNext()) {
end = Tokenizer.next();
type = (short)typeList[i];
result += line.Substring(begin, end - begin) + "|";
begin = end;
}
System.Windows.Forms.MessageBox.Show(result);
}