Class: JLDrill::Tatoeba::ChineseIndexFile
- Defined in:
- lib/jldrill/model/Tatoeba.rb
Constant Summary collapse
- LINK_RE =
/^(\d*)[\t](\d*)/
- CHINESE_INDEX_RE =
/^(\d*)[\t]cmn/
- ENGLISH_INDEX_RE =
/^(\d*)[\t]eng/
Instance Attribute Summary
Attributes inherited from DataFile
#encoding, #file, #lines, #parsed, #publisher, #stepSize
Instance Method Summary collapse
- #dataSize ⇒ Object
-
#finishParsing ⇒ Object
Don’t erase @lines because we need them later.
-
#getPositions(kanji) ⇒ Object
Returns an array of the positions in @chineseIndeces whose corresponding sentence contains the given kanji.
-
#initialize(sentences) ⇒ ChineseIndexFile
constructor
A new instance of ChineseIndexFile.
- #loaded? ⇒ Boolean
- #parseEntry ⇒ Object
- #search(kanji, reading) ⇒ Object
Methods inherited from DataFile
#createLines, #eof?, #findEncoding, #fraction, #load, #parse, #parseChunk, #parser, #readLines, #reset, #setLoaded, #shortFilename
Constructor Details
#initialize(sentences) ⇒ ChineseIndexFile
Returns a new instance of ChineseIndexFile.
117 118 119 120 121 122 123 124 |
# File 'lib/jldrill/model/Tatoeba.rb', line 117
#
# Creates an empty index over the given sentence store.
#
# sentences - the object later queried through dataAt / sentenceAt;
#             it is stored untouched in @sentences.
#
# Both index arrays start out empty, and @ruledOut starts at 0.
def initialize(sentences)
  super()
  # Assigned after super() so that this value is the one left in place.
  @stepSize = 10_000
  @sentences = sentences
  @chineseIndeces = []
  @englishIndeces = []
  @ruledOut = 0
end
Instance Method Details
#dataSize ⇒ Object
160 161 162 |
# File 'lib/jldrill/model/Tatoeba.rb', line 160
#
# Returns how many Chinese sentence indices have been collected so far.
def dataSize
  @chineseIndeces.length
end
#finishParsing ⇒ Object
Don’t erase @lines because we need them later.
165 166 167 |
# File 'lib/jldrill/model/Tatoeba.rb', line 165
#
# Flags the file as loaded once parsing is complete.
# @lines is deliberately left alone here: it is still needed later.
def finishParsing
  setLoaded(true)
end
#getPositions(kanji) ⇒ Object
Returns an array of the positions in @chineseIndeces whose corresponding sentence contains the given kanji.
176 177 178 179 180 |
# File 'lib/jldrill/model/Tatoeba.rb', line 176
#
# Returns an array of the positions in @chineseIndeces whose
# corresponding sentence contains the given kanji.
#
# kanji - the text to look for. A String is matched literally: it is
#         escaped first, so characters such as "." or "(" are not
#         interpreted as regular-expression syntax. A Regexp is used
#         unchanged.
#
# A position whose sentence lookup yields nil is skipped.
def getPositions(kanji)
  pattern = kanji.is_a?(Regexp) ? kanji : Regexp.new(Regexp.escape(kanji))
  @chineseIndeces.each_index.select do |position|
    sentence = @sentences.sentenceAt(@chineseIndeces[position])
    sentence && sentence.match(pattern)
  end
end
#loaded? ⇒ Boolean
169 170 171 172 |
# File 'lib/jldrill/model/Tatoeba.rb', line 169
#
# Reports whether the file has been loaded, exactly as the inherited
# implementation does; nothing is added to the answer.
def loaded?
  super
end
#parseEntry ⇒ Object
126 127 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 148 149 150 151 152 153 154 155 156 157 158 |
# File 'lib/jldrill/model/Tatoeba.rb', line 126
#
# Parses the link line at @lines[@parsed] and then advances @parsed by
# one, whether or not the line was a usable link.
#
# A link line holds two sentence indices separated by a tab. Lines that
# do not match LINK_RE are skipped.
def parseEntry
  link = LINK_RE.match(@lines[@parsed])
  recordLinkedPair(link[1].to_i, link[2].to_i) if link
  @parsed += 1
end

# Records the pair (cindex, eindex) when cindex is a Chinese sentence
# and eindex is an English one.
#
# We are only interested in Chinese sentences, so the left-hand index is
# checked first. If it is not Chinese, all following entries with the
# same left-hand index are ignored (entries are ordered by the left-hand
# side, so remembering the last one in @ruledOut is enough). If it is
# Chinese, right-hand entries are checked until an English one is found;
# after that the remaining entries for that index are ignored as well.
def recordLinkedPair(cindex, eindex)
  return if cindex == @ruledOut

  unless CHINESE_INDEX_RE.match(@sentences.dataAt(cindex))
    # It's not a Chinese sentence, so don't process the following
    # entries with the same index.
    @ruledOut = cindex
    return
  end

  # The right-hand sentence is only looked up once the left-hand one is
  # known to be Chinese.
  return unless ENGLISH_INDEX_RE.match(@sentences.dataAt(eindex))

  @chineseIndeces.push(cindex)
  @englishIndeces.push(eindex)
  # We've found the English for this Chinese sentence, so don't process
  # the following entries with the same index.
  @ruledOut = cindex
end
private :recordLinkedPair
#search(kanji, reading) ⇒ Object
182 183 184 185 186 187 188 189 190 191 192 |
# File 'lib/jldrill/model/Tatoeba.rb', line 182
#
# Builds one TatoebaExample for every indexed Chinese sentence that
# contains the given kanji, in index order.
#
# kanji   - the text to look for; also used to build each example's
#           vocabulary usage.
# reading - accepted but not used by this method.
#
# Returns an Array of TatoebaExample (empty when nothing matches).
def search(kanji, reading)
  getPositions(kanji).map do |position|
    TatoebaExample.new(@chineseIndeces[position],
                       @englishIndeces[position],
                       JLDrill::VocabularyUsage.from_B_line(kanji),
                       @sentences)
  end
end