Class: JLDrill::Tatoeba::ChineseIndexFile

Inherits:
DataFile
  • Object
show all
Defined in:
lib/jldrill/model/Tatoeba.rb

Constant Summary collapse

LINK_RE =
/^(\d*)[\t](\d*)/
CHINESE_INDEX_RE =
/^(\d*)[\t]cmn/
ENGLISH_INDEX_RE =
/^(\d*)[\t]eng/

Instance Attribute Summary

Attributes inherited from DataFile

#encoding, #file, #lines, #parsed, #publisher, #stepSize

Instance Method Summary collapse

Methods inherited from DataFile

#createLines, #eof?, #findEncoding, #fraction, #load, #parse, #parseChunk, #parser, #readLines, #reset, #setLoaded, #shortFilename

Constructor Details

#initialize(sentences) ⇒ ChineseIndexFile

Returns a new instance of ChineseIndexFile.



117
118
119
120
121
122
123
124
# File 'lib/jldrill/model/Tatoeba.rb', line 117

# Creates an empty index over the given sentence store.
#
# sentences - object later queried with dataAt and sentenceAt by the
#             other methods of this class.
def initialize(sentences)
    super()
    @sentences = sentences

    # Parallel arrays: position i in each one describes the same
    # Chinese/English sentence pair.
    @englishIndeces = []
    @chineseIndeces = []

    # Left-hand index that parseEntry skips; starts at 0.
    @ruledOut = 0

    # Overrides the inherited stepSize attribute
    # (presumably the number of lines handled per parse step -- confirm
    # against DataFile).
    @stepSize = 10_000
end

Instance Method Details

#dataSize ⇒ Object



160
161
162
# File 'lib/jldrill/model/Tatoeba.rb', line 160

# Number of Chinese sentence indices collected so far.
def dataSize
    @chineseIndeces.length
end

#finishParsing ⇒ Object

Don’t erase @lines because we need them later



165
166
167
# File 'lib/jldrill/model/Tatoeba.rb', line 165

# Flags the file as loaded via the inherited setLoaded.
# This override does nothing else; in particular it leaves @lines
# untouched, because they are needed later.
def finishParsing
    setLoaded(true)
end

#getPositions(kanji) ⇒ Object

Return an array of positions in the chineseIndeces for which the respective sentence contains the given kanji



176
177
178
179
180
# File 'lib/jldrill/model/Tatoeba.rb', line 176

# Returns the positions (array offsets, not sentence indices) in
# @chineseIndeces whose sentence matches the given kanji.
#
# kanji - handed straight to #match on the value returned by
#         @sentences.sentenceAt; if that value is a String, a String
#         kanji is treated as a regular expression pattern.
#
# Returns an Array of Integers in ascending order; empty when nothing
# matches or no indices have been collected.
def getPositions(kanji)
    @chineseIndeces.each_index.select do |pos|
        @sentences.sentenceAt(@chineseIndeces[pos]).match(kanji)
    end
end

#loaded? ⇒ Boolean

Returns:

  • (Boolean)


169
170
171
172
# File 'lib/jldrill/model/Tatoeba.rb', line 169

# Reports whether the file is loaded by deferring entirely to the
# superclass implementation.
def loaded?
    super
end

#parseEntry ⇒ Object



126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# File 'lib/jldrill/model/Tatoeba.rb', line 126

# Parses the link line at @lines[@parsed] and advances @parsed by one.
#
# A link line pairs a left-hand sentence index with a right-hand one.
# We are only interested in Chinese sentences that have an English
# counterpart.  Lines are ordered by the left-hand index, so remembering
# the last index we finished with (@ruledOut) is enough to skip the
# remaining lines for it:
#   * left side is not Chinese  -> rule the index out
#   * left side is Chinese and the right side is English
#                               -> record the pair, then rule it out
#   * left side is Chinese but the right side is not English
#                               -> keep looking on the following lines
#
# Lines that do not match LINK_RE are skipped.
# Returns the new value of @parsed.
def parseEntry
    link = LINK_RE.match(@lines[@parsed])
    if link
        cindex = link[1].to_i
        eindex = link[2].to_i
        unless cindex == @ruledOut
            chinese = @sentences.dataAt(cindex)
            english = @sentences.dataAt(eindex)
            if !CHINESE_INDEX_RE.match(chinese)
                # Not a Chinese sentence: ignore every following line
                # with the same left-hand index.
                @ruledOut = cindex
            elsif ENGLISH_INDEX_RE.match(english)
                # Found the English for this Chinese sentence, so the
                # following lines with the same index are not needed.
                @chineseIndeces << cindex
                @englishIndeces << eindex
                @ruledOut = cindex
            end
        end
    end
    @parsed += 1
end

#search(kanji, reading) ⇒ Object



182
183
184
185
186
187
188
189
190
191
192
# File 'lib/jldrill/model/Tatoeba.rb', line 182

# Builds one TatoebaExample for every indexed Chinese sentence that
# matches the given kanji (as decided by getPositions).
#
# kanji   - text to look for; also used to build the VocabularyUsage
#           attached to each example.
# reading - accepted but not used by this method.
#
# Returns an Array of TatoebaExample, empty when nothing matches.
def search(kanji, reading)
    getPositions(kanji).map do |pos|
        chinese = @chineseIndeces[pos]
        english = @englishIndeces[pos]
        # A separate usage object is created for every example.
        usage = JLDrill::VocabularyUsage.from_B_line(kanji)
        TatoebaExample.new(chinese, english, usage, @sentences)
    end
end