diff -ur chasen-2.3.1/ChangeLog chasen/ChangeLog --- chasen-2.3.1/ChangeLog 2003-06-18 00:29:09.000000000 +0900 +++ chasen/ChangeLog 2003-06-27 20:53:00.000000000 +0900 @@ -1,3 +1,14 @@ +2003-06-27 TAKAOKA Kazuma + + * lib/literal.c: 'EUCJP' -> 'EUC-JP' and so on. + + * mkchadic/makeda.cpp: Make padding for memory alignment. + + * lib/dartsdic.cpp (da_open): Fix allocate order. + Thanks to ITO Yoshiharu + + * lib/literal.c (jlit_init): Remove debug output. + 2003-06-17 Masayuki Asahara * documents modification diff -ur chasen-2.3.1/lib/dartsdic.cpp chasen/lib/dartsdic.cpp --- chasen-2.3.1/lib/dartsdic.cpp 2003-02-25 10:39:33.000000000 +0900 +++ chasen/lib/dartsdic.cpp 2003-06-27 20:50:52.000000000 +0900 @@ -35,7 +35,7 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $Id: dartsdic.cpp,v 1.7 2003/02/25 01:39:33 kazuma-t Exp $ + * $Id: dartsdic.cpp,v 1.9 2003/06/27 11:50:52 kazuma-t Exp $ */ #include @@ -58,9 +58,9 @@ darts_t *da; Darts::DoubleArray *darts = new Darts::DoubleArray; + da = (darts_t*)cha_malloc(sizeof(darts_t)); da->da_mmap = cha_mmap_file(daname); darts->setArray(cha_mmap_map(da->da_mmap)); - da = (darts_t*)cha_malloc(sizeof(darts_t)); da->da = darts; da->lex_mmap = cha_mmap_file(lexname); da->dat_mmap = cha_mmap_file(datname); @@ -106,6 +106,7 @@ reading_len = ((unsigned char *)dat_map(da))[index++]; pron_len = ((unsigned char *)dat_map(da))[index++]; base_len = ((unsigned char *)dat_map(da))[index++]; + index++; // skip padding info_len = *((unsigned short *)((unsigned char *)dat_map(da) + index)); index += sizeof(unsigned short); diff -ur chasen-2.3.1/lib/dartsdic.h chasen/lib/dartsdic.h --- chasen-2.3.1/lib/dartsdic.h 2002-12-02 23:25:57.000000000 +0900 +++ chasen/lib/dartsdic.h 2003-06-27 20:50:52.000000000 +0900 @@ -1,5 +1,5 @@ /* - * $Id: dartsdic.h,v 1.2 2002/12/02 14:25:57 kazuma-t Exp $ + * $Id: dartsdic.h,v 1.3 2003/06/27 11:50:52 kazuma-t Exp $ */ #ifndef __DARTSDIC_H__ @@ -8,12 +8,12 @@ typedef struct __darts_t darts_t; typedef struct { - unsigned short posid __attribute__ ((packed)); - unsigned char inf_type __attribute__ ((packed)); - unsigned char inf_form __attribute__ ((packed)); - unsigned short weight __attribute__ ((packed)); - short con_tbl __attribute__ ((packed)); - long dat_index __attribute__ ((packed)); + unsigned short posid; + unsigned char inf_type; + unsigned char inf_form; + unsigned short weight; + short con_tbl; + long dat_index; } da_lex_t; darts_t *da_open(char*, char*, char*); diff -ur chasen-2.3.1/lib/literal.c chasen/lib/literal.c --- chasen-2.3.1/lib/literal.c 2003-06-04 23:19:35.000000000 +0900 +++ chasen/lib/literal.c 2003-06-27 20:53:00.000000000 +0900 @@ -32,7 +32,7 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $Id: literal.c,v 1.1 2003/06/04 14:19:35 kazuma-t Exp $ + * $Id: literal.c,v 1.3 2003/06/27 11:53:00 kazuma-t Exp $ */ #include @@ -173,10 +173,10 @@ }; static char *encode_list[] = { - "EUCJP", /* CHASEN_ENCODE_EUCJP */ - "SJIS", /* CHASEN_ENCODE_SJIS */ + "EUC-JP", /* CHASEN_ENCODE_EUCJP */ + "Shift_JIS", /* CHASEN_ENCODE_SJIS */ "ISO8859-1", /* CHASEN_ENCODE_ISO8859 */ - "UTF8", /* CHASEN_ENCODE_UTF8 */ + "UTF-8", /* CHASEN_ENCODE_UTF8 */ }; static void @@ -188,14 +188,13 @@ if (encode == NULL) encode = encode_list[Cha_encode]; - cd = iconv_open(encode, "EUCJP"); + cd = iconv_open(encode, "EUC-JP"); if (cd == (iconv_t)-1) { fprintf(stderr, "%s is invalid encoding schema, ", encode); - fprintf(stderr, "will use 'EUCJP'\n"); + fprintf(stderr, "will use 'EUC-JP'\n"); i = 0; do { cha_literal[i][2] = cha_literal[i][0]; - fprintf(stderr, "%s\n", cha_literal[i][2]); } while (cha_literal[++i][0] != NULL); return; } diff -ur chasen-2.3.1/mkchadic/makeda.cpp chasen/mkchadic/makeda.cpp --- chasen-2.3.1/mkchadic/makeda.cpp 2003-02-16 16:27:08.000000000 +0900 +++ chasen/mkchadic/makeda.cpp 2003-06-27 20:50:52.000000000 +0900 @@ -35,7 +35,7 @@ * NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS * SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. * - * $Id: makeda.cpp,v 1.5 2003/02/16 07:27:08 kazuma-t Exp $ + * $Id: makeda.cpp,v 1.6 2003/06/27 11:50:52 kazuma-t Exp $ */ #include @@ -46,6 +46,7 @@ #include #include #include +#include "dartsdic.h" const int CHAINT_OFFSET = 11; const int CHAINT_SCALE = (256 - 11); @@ -117,11 +118,17 @@ fout << (unsigned char)entry.reading.size(); fout << (unsigned char)entry.pron.size(); fout << (unsigned char)entry.base.size(); + fout << '\0'; // padding fout.write((char *)&info_len, sizeof(unsigned short)); fout << entry.reading << '\0' << entry.pron << '\0'; fout << entry.base << '\0' << entry.info << '\0'; fout << entry.compound << '\n'; + /* make padding */ + if (fout.tellp() % 2) { + fout << '\0'; + } + if (fout.fail()) return -1; else @@ -131,36 +138,32 @@ std::ostream& fout; }; -#define bytecpy(dist, src) \ -{ memcpy(dist, &(src), sizeof(src)); (dist) += sizeof(src); } - class LexFile { public: LexFile(std::ostream& fout = std::cout) : fout(fout) {}; - inline long write(size_t len, std::vector& lex_data) { + inline long write(size_t len, std::vector& lex_data) { long pos = fout.tellp(); fout << (unsigned char)len; fout << (unsigned char)lex_data.size(); - for (std::vector::iterator i = lex_data.begin(); + for (std::vector::iterator i = lex_data.begin(); i != lex_data.end(); i++) { - fout.write(*i, 12); + fout.write((char*)(*i), sizeof(da_lex_t)); } if (fout.fail()) return -1; else return pos; } - inline static char* pack(char* str, + inline static da_lex_t* pack(da_lex_t* lex, const Entry& entry, const long dat_index) { - char *head = str; - bytecpy(str, entry.posid); - bytecpy(str, entry.inf_type); - bytecpy(str, entry.inf_form); - bytecpy(str, entry.weight); - bytecpy(str, entry.con_tbl); - bytecpy(str, dat_index); + lex->posid = entry.posid; + lex->inf_type = entry.inf_type; + lex->inf_form = entry.inf_form; + lex->weight = entry.weight; + lex->con_tbl = entry.con_tbl; + lex->dat_index = dat_index; - return head; + return lex; } private: std::ostream& fout; @@ -197,13 +200,13 @@ DataFile datfile(datastream); LexFile lexfile(lexstream); - typedef std::multimap Hash; + typedef std::multimap Hash; typedef Hash::value_type HashVal; Hash entries; while (intfile.getentry(entry)) { long da_index = datfile.write(entry); - char* buf = new char[12]; + da_lex_t* buf = new da_lex_t; entries.insert(HashVal(entry.form, LexFile::pack(buf, entry, da_index))); } @@ -215,7 +218,7 @@ size_t* lens = new size_t[entries.size()]; int* vals = new int[entries.size()]; - std::vector lex_data; + std::vector lex_data; i = entries.begin(); while (i != entries.end()) { const std::string& key = i->first;