%%% ====================================================================
%%%  @LaTeX3-article{ LaTeX3-VT15-02,
%%%  filename        = "vt15d02.tex",
%%%  archived        = "ctan:/tex-archive/info/ltx3pub/",
%%%  author          = "Bernard Gaulle",
%%%  doc-group       = "Volunteer task VT15 about multilingual
%%%                     documents",
%%%  title           = "Requirements in multilingual documents:
%%%                     Part I Definitions",
%%%  version         = "1.01",
%%%  date            = "22 September 1993",
%%%  time            = "11:49:47 GMT",
%%%  status          = "public, contributed.",
%%%  author-email    = "gaulle@circe.fr",
%%%  author-address  = "CIRCE-CNRS         \\
%%%                     BP 167              \\
%%%                     F-91403 Orsay Cedex  \\
%%%                     France",
%%%  abstract        = "This article suggests basic definitions that
%%%                     will be used by the working group",
%%%  keywords        = "language, dialect, multilingual",
%%%  project-address = "LaTeX3 Project            \\
%%%                     c/o Dr. Chris Rowley      \\
%%%                     The Open University       \\
%%%                     Parsifal College          \\
%%%                     Finchley Road             \\
%%%                     London NW3 7BG, England, UK",
%%%  project-tel     = "+44 171 794 0575",
%%%  project-FAX     = "+44 171 433 6196",
%%%  project-email   = "LTX3-Mgr@SHSU.edu",
%%%  copyright       = "Copyright (C) 1993 LaTeX3 Project
%%%                     and Bernard GAULLE.
%%%                     All rights reserved.
%%%
%%%                     Permission is granted to make and distribute
%%%                     verbatim copies of this publication or of
%%%                     coherent parts from this publication provided
%%%                     this copyright notice and this permission
%%%                     notice are preserved on all copies.
%%%
%%%                     Permission is granted to copy and distribute
%%%                     translations of this publication or of
%%%                     individual items from this publication into
%%%                     another language provided that the translation
%%%                     is approved by the original copyright holders.
%%%
%%%                     No other permissions to copy or distribute this
%%%                     publication in any form are granted and in
%%%                     particular no permission to copy parts of it
%%%                     in such a way as to materially change its
%%%                     meaning.",
%%%  generalinfo     = "To subscribe to the LaTeX3 discussion list:
%%%
%%%                      Send mail to listserv@vm.urz.uni-heidelberg.de
%%%                      with the following line as the body of the
%%%                      message (substituting your own name):
%%%
%%%                        subscribe LaTeX-L First-name Surname
%%%
%%%                     To find out about volunteer work:
%%%
%%%                      look at the document vol-task.tex which can
%%%                      be obtained electronically, see below.
%%%
%%%                     To retrieve project publications electronically:
%%%
%%%                      Project publications are available for
%%%                      retrieval by anonymous ftp from ctan hosts:
%%%                          ftp.tex.ac.uk
%%%                          ftp.dante.de
%%%                          ftp.shsu.edu
%%%                      in the directory /tex-archive/info/ltx3pub.
%%%
%%%                      The file ltx3pub.bib in that directory gives
%%%                      full bibliographical information including
%%%                      abstracts in BibTeX format.  A brief history
%%%                      of the project and a description of its aims
%%%                      is contained in l3d001.tex.
%%%
%%%                     If you only have access to email, and not ftp
%%%                      You may use the ftpmail service.
%%%                      Send a message just containing the word
%%%                          help
%%%                      to ftpmail@ftp.shsu.edu
%%%                      for more information about this service.
%%%
%%%                     For offers of financial contributions or
%%%                      contributions of computing equipment or
%%%                      software, contact the project at the above
%%%                      address, or the TeX Users Group.
%%%
%%%                     For offers of technical assistance, contact the
%%%                      project at the above address.
%%%
%%%                     For technical enquiries and suggestions, send
%%%                      e-mail to the latex-l list or contact the
%%%                      project at the above address.",
%%%  checksum        = "40358 1136 6542 47979",
%%%  docstring       = "The checksum field above contains a CRC-16
%%%                     checksum as the first value, followed by the
%%%                     equivalent of the standard UNIX wc (word
%%%                     count) utility output of lines, words, and
%%%                     characters.  This is produced by Robert
%%%                     Solovay's checksum utility.",
%%%  }
%%% ====================================================================

\documentstyle[draft,bnf]{l3ms001}
\newcounter{languageitems}
\begin{document}
\title{Requirements in multi-lingual environments \\
       Part I  Definitions}
\author{Coordinator: Bernard {\sc Gaulle}}
\date{22 September 1993 \\
      Version 1.01}
\maketitle
\section{What is a language?}

When we want to print a language we would like \LaTeX{} to apply
automatically all the typographic rules usually used in the countries
speaking that language. Often, different countries speak the same
language (or nearly the same) but have different typographic rules.
Sometimes, languages are slightly or even deeply different and we say
that they are dialects.

As we can't imagine all typographic rules required to process
correctly any language, we can ---in a first approach--- look at the
basic control sequences provided by a standard \TeX{} ``motor''.

\subsection*{Standard \TeX{} mechanisms for language processing}

Generally speaking, all \TeX{} control sequences have been designed to
bring a specific typographic feature but only few of them characterize
a language.  For example, \verb|\par| is a basic control sequence that
has the same meaning in all languages (not necessarily the same
processing): basically end a paragraph\footnote{We simplify a lot
  here, intentionally.  Full accuracy is not required to
  understand the proposed discussion.}.  Reversely, there are
parameters, counters or other control sequences that have different
values or contents from one language to another.  Here are the most
important ones:

\begin{description}
\item[]\verb|\language| [primitive integer parameter]
  is an integer which gives the sequence number of the
  hyphenation pattern table to use. When the current language
  differs from \verb|\language|, a ``whatsit node'' specifying the
  current language is inserted.
  Notice that no change is done (i.e. no language whatsit inserted) in
  ``inner hmode''; if \verb|\language<0| or \verb|\language>255|,
  then 0 is used internally.
 As pointed out by Frank Mittelbach,
  this name is really inappropriate and should probably be replaced by
  a clearer one, such as \verb|\hyphenpatternnumber|.
 Value from 0 to 255, so up to
  256 ``languages''.

\item[]\verb|\setlanguage| [primitive command]
  activates a hyphenation pattern file (inserting the whatsit)
  without changing the value of \verb|\language|.  Very restricted use
  (really useful in inner hmode, see \verb|\language|).


\item[]\verb|\patterns| [primitive command]
  loads into the \TeX{} memory related to
  languages a hyphenation
  pattern file associated with the current language number (up to 256
  files).
  Control sequence only allowed at \verb|initex| time for {\em format}
  creation.
\item[]\verb|\hyphenation| [primitive command]
  allows \TeX{} to learn about a list of words to
  be hyphenated differently from the usual \TeX{} algorithm in
  the current  language. This list can
  be introduced in \TeX{} memory at \verb|initex| time or later and can
  be modified or completed at any time.

\item[]\verb|\lefthyphenmin| \& \verb|\righthyphenmin|
  [primitive integer parameters]
  are numbers specifying
  the amount of characters starting and ending a word that must be never
  hyphenated with the {\em current} language. These values are not saved
  in the \TeX{} memory related to languages.

\item[]\verb|\uchyph| [primitive integer parameter]
  specifies whether words written with one or more upper
  case letters are candidates or not for hyphenation.
  (by default, \verb|plain| \TeX{}
  and \verb|lplain| allow hyphenation of words with capital letters).
  This value is not saved in the \TeX{} memory related to languages.

\item[]\verb|\discretionary| [primitive command]
  makes it possible to specify how a character sequence
  must be hyphenated, providing the {\em pre-break text} with its
  discretionary hyphen and the {\em post-break text}.
  It inhibits hyphenation in the rest (before and after the
  discretionary) of the word.
\verb|\-| is the abbreviation of the
  most common case of discretionary break e.g.
  \verb|\discretionary{-}{}{}|.
  This is a setting inserted in a word, for that word, at input time. So
  it can't be saved any way.

\item[]\verb|\hyphenchar| [primitive command]
  is the hyphen of a font,
  can be coded to point out the hyphen character to use
  for a font. \verb|\hyphenchar\font=-1| suppresses all hyphenation for
  the current font. Notice that \verb|\hyphenchar\font=`\-|
  is the common way to
  provide a default hyphen character for the current font. There is a
  default hyphen character that is set in \verb|plain| \TeX{} for all
  fonts, this is: \verb|\defaulthyphenchar=`\-|.
  \verb|\defaulthyphenchar| is the default
  hyphen to be used, if \verb|\hyphenchar| isn't set.
  The `-' character as hyphen character is hard-wired in some other
  commands (e.g. \verb|\hyphenation|).
  Notice that \verb|\hyphenchar| changes are global.
  These values are not saved in the \TeX{} memory related to languages.

\item[]\verb|\hyphenpenalty| \& \verb|\exhyphenpenalty|
  [primitive integer parameters]
  are penalties
  at a discretionary break. The first one is used if the {\em pre-break text} is
  not empty; otherwise the second value is used.
  These values are not saved in the \TeX{} memory related to languages.

\item[]\verb|\tolerance| \& \verb|\pretolerance|
  [primitive integer parameters]
  are values saying ``how much''
  the \TeX{} hyphenation algorithm must be severe or not.
  They have to be adjusted with the line size.
  These values are not saved in the \TeX{} memory related to languages.

NB: \verb|\hyphenpenalty10000|
  \verb|\exhyphenpenalty10000|  \verb|\pretolerance10000| \\
  can be introduced when we need no hyphenation at all
  (\TeX book page 394).

\item[]\verb|\lccode| [primitive command]
  tells \TeX{} which is the lower-case character of a
  specific one but telling ``0'' instructs \TeX{} that no hyphenation
  can occur around that character (whatever font is in use).
  This value is not saved in the \TeX{} memory related to languages.

%As we discuss about \verb|\lccode| we must say few words about the 2
%following items:
%\begin{itemize}
\item[]\verb|\uccode| [primitive command] is not exactly the opposite of
  \verb|\lccode|; it only instructs \TeX{} which is the upper-case
  character
  of a specific one. This primitive is deeply related to the font
  encoding which will be discussed later in this chapter.
  This value is not saved in the \TeX{} memory related to languages.

\item[]\verb|\sfcode| [primitive command] gives a {\em space factor} to
  a specific character. It is usually used to add stretchability to
  punctuation marks typed in the text
  (primitive used inside the well known \verb|\frenchspacing| macro;
  which name is improper...).
  This is typically a typographic feature.
  This value is not saved in the \TeX{} memory related to languages.
%\end{itemize}

\item[]\verb|\allowhyphens| [macro]
  allows the following word to be hyphenated
(appendix D: Dirty Tricks of the \TeX book p 394).

\item[]\verb|\doublehyphendemerits| \& \verb|\finalhyphendemerits|
  [primitive integer parameters]
  are values of demerits assessed to \TeX{}
  when breaking a paragraph into lines. These
  values take their effect when two consecutive lines end with
  discretionary
  breaks or if it is the second-last line of the entire paragraph.
  These values are not saved in the \TeX{} memory related to languages.

\end{description}

There are probably a lot of other control sequences that might be, as
a side effect, considered specific of a language. For example,
consider \verb|\font|; is it really language dependent? It's obvious
that some languages use specific fonts but we can't consider that
these fonts are a characteristic of specific languages because they
can be used for other purposes than writing a given language and that
often we have a choice between several fonts when we want to write in
that given language.  Therefore we can say that a language isn't
characterized by the fonts used for writing a document in that
language.

\subsection*{Other mechanisms for language processing}

There are also other {\em non-standard\/} mechanisms for language
processing; I am thinking in particular of the direction of writing.

\begin{description}
\item[]\verb|\beginR \endR| and \verb|\beginL \endL| used with a {\tt
  TeX--XeT} motor offer the opportunity to write from right to left and
  from left to right.
\end{description}

There is certainly something similar with \TeX{} motors used in Japan...
to say whether the first direction of writing is from top to bottom or
not.

{\em\% Don't hesitate to send me proposals to complete this part...}

\bigskip
Originally, Ml\TeX{} was the first \TeX{} extension to write
multi-lingual
documents. Is any control sequence of Ml\TeX{} still language specific
nowadays? We have in fact two cases for consideration:
\begin{enumerate}
\item When the user has 8 bit fonts (assuming the appropriate encoding
      i.e. the Cork standard) he doesn't need the feature supplied by
      Ml\TeX{} which allows words with diacritics to be  correctly
      hyphenated.
\item When the user has only 7 bit fonts and frequently uses a language
      having a lot of diacritics, then he can request the
      \verb|\charsubdef|
      feature of Ml\TeX{} which makes it possible to input 8 bit characters,
      hyphenate words and
      substitute the corresponding pair: accent macro and  letter of
      the 7 bit font for printing.
\end{enumerate}
This feature is no longer the only way to achieve hyphenation of words
with diacritics. With virtual fonts we can do that processing,
even if we use 7 bit real fonts. As we imagine that in the near
future people will use either DC/EC fonts or VF fonts, it would be
useless to keep this feature in mind when creating a multi-lingual
standard for the next decade.

\subsection*{The character set, filters and font encoding}

There are, at least, 3 character sets: one used by the user to
produce his \verb|.tex| file, one which is understood by \LaTeX{} and
one used to output the document.

\subsubsection*{The input character set}

Each language has its own character set but does it mean that the
printed output depends on a specific character set? It's sure that
usually we use, for example, the string ``YES'' to print YES, but
there are a lot of other ways to print the same word, the simplest
being a macro like \verb|\YA| \ldots Obviously different languages
have the same character set.

With Personal Computer (PC) running DOS, for example, the user can
specify his country ``code-page'' and then the \TeX{} motor has to be
adapted to the input encoding. The feature proposed by em\TeX{} is
based on filters which can be customized by the user. We think that it
is the most efficient technique to process a linguistic character set.

%%%The relationships between the input character set and the printed
%%%glyphs are not always obvious.

The input character set is in fact extendible up to the
keyboard's limits. It is difficult to imagine a user inputting
 a bilingual document with two keyboards! So:

\begin{quote}
\em The character set used to input a multilingual document is text and
user dependent, but not language dependent.
\end{quote}

Reversely, a language can't be characterized (i.e. defined) by a
specific character set.

\subsubsection*{The \LaTeX{} character set}

By default, the character set used by \LaTeX{} is composed of the 256
chars of \TeX{} itself adapted by the installation to match internally
the ASCII (128 first chars) standard and optionally define {\em
  local\/} characters.  That way, we can say there is only one \TeX{}
character set in the world (for all languages). As everybody knows,
256 characters are not enough to process complex languages like
Chinese, but this is a \TeX{} feature we can't change\ldots

\subsubsection*{The font character set}

Each font can print individual characters but \TeX{} often uses 2 or
more font characters to print a significant piece of text or math. So
the font character set is not the right notion to keep in mind and we
prefer the notion of {\em font encoding}. NFSS2 brings 5 classes of
encoding: text, math letter, math symbol, math extended and other
(unknown encoding).

We have already seen that \verb|\lccode| of characters and capital
letters are important data for hyphenation processing; do we need to
specify which are the lower-case and upper-case letters of each
character and for each language?  We are speaking here in fact about
the default font encoding which governs the \verb|\uppercase| and
\verb|\lowercase| mechanism. Printout is extremely dependent on that
encoding. The EC text encoding (T1 inside NFSS2) defined at Cork in
1990 was designed particularly for European languages; it remains a
little useless for languages like US-English. Thus some people will
use it and other will not.  The font encoding is an important
information to be known by each language specific coding (style) to be
able to print the appropriate glyphs. Therefore we suggest that:

\begin{quote}
  \em \LaTeX{} V3 will take account, for each language, of the font
  encoding scheme to use.
\end{quote}

We think that the ISO coding is not of real interest because \TeX{}
fonts which can contain 256 characters are basically more efficient
and also are, de facto, a standard. We know that the font encoding is
not fully coherent for the CM fonts (OT1, OM1, etc. inside NFSS2) but
many installations are using them and will still use them for a long
time. So this information about the font encoding scheme must be
available for the language processing code.

Due to this high level notion of font encoding we don't need at all,
now, the sub-mechanisms of \verb|\lccode| and \verb|\uccode| to teach
\LaTeX{} which are the uppercase or lowercase letters of a specific
one.

\bigskip
As noticed by Bernd Raichle:
\begin{quote}
\begin{verbatim}
TeX does some things wrong, because it mixes between the input
character encoding and font position encoding in `hmode' (e.g.
a token `A' with \catcode 11 (= input encoding) is the same as
\char`\A (= font encoding) and produces the character in
position 65 of the current font. In `mmode' you can distinguish
between input and font encoding using \mathcode (and \mathchar)
and the something similar should exist for the normal text mode.

Because of this mixing, the \lccode array is used for
\lowercase (= input character coding) and for the hyphenation
process (= font encoding, because an `hlist' consists of
font/char pairs). For this reason, \lowercase/\uppercase can't
be used without restrictions for lowercasing/uppercasing text
in the output.
\end{verbatim}
\end{quote}

\subsection*{Some classification}

The previous survey shows us that we have handled only three
kinds of process:
\begin{enumerate}
\item[I] hyphenation (standard \TeX{})
\item[II] direction of writing (\TeX{} extensions)
\item[III] font encoding  (as in NFSS2)
\end{enumerate}

Now let's try to classify the hyphenation part to find the appropriate
items that are characteristic of a language. Firstly we can say that
something is characteristic of a language when we can keep it in
\TeX{}'s memory and when it doesn't depend only on the user's actions at input
time.  Thus we eliminate the following control sequences:
\verb|\setlanguage|, \verb|\discretionary|, \verb|\hyphenpenalty|,
$\!$\verb|\exhyphenpenalty|, $\!$\verb|\pretolerance|,
\verb|\tolerance|, \verb|\allowhyphens|, \verb|\doublehyphendemerits|
and finally \verb|\finalhyphendemerits|.

Regarding \verb|\hyphenchar| the problem is that it is font dependent.
As already explained, a font isn't a characteristic of a language in
itself.  A font is generally used by various languages and at least by
dialects.

\bigskip
What is remaining?
\begin{enumerate}
\item\verb|\language|, an integer from 0 to 255.
\item\verb|\patterns|, a list of {\em patterns}, i.e. strings of
   characters and numbers. May be large or not,
   depending on the language structure. We must keep in mind that this
   list is a {\em one time} definition (\verb|initex|).
\item\verb|\hyphenation|, a list of words specially hyphenated. Normally
   reduced; various lists, like US one, are becoming really long.
\item\verb|\lefthyphenmin|,   an integer from 1 to n.
\item\verb|\righthyphenmin|,  an integer from 1 to n.
\item\verb|\uchyph|, an integer being 1 or -1.
\item\verb|\lccode|, a list of characters.

As already said, we reject \verb|\uccode| which is in fact a component
of the font encoding. The primitive \verb|\sfcode| is also eliminated
because it isn't fully related to a language, this is a typographic
feature that might be easily introduced in any language style file.

\setcounter{languageitems}{\value{enumi}}
\end{enumerate}

Regarding the direction of writing, we think that we cannot ignore it
merely because standard \TeX{} provides no such facility to the
user: doing so would exclude a lot of the languages of our planet. So, a
language must also be defined by:
\begin{enumerate}
\setcounter{enumi}{\value{languageitems}}
\item The {\bf first} direction of writing i.e. top-down (TD) or
      bottom-up (BU) or
      left to right (LR) or right to left (RL).
\item The {\bf second} direction of writing, complementary from
      the first one i.e. if the first one is TD then the second
      can be either LR or RL.
\setcounter{languageitems}{\value{enumi}}
\end{enumerate}
And as we already stated:
\begin{enumerate}
\setcounter{enumi}{\value{languageitems}}
\item The font encoding scheme identifier used for text (OT1, T1, ...)
 as defined with NFSS2.
\setcounter{languageitems}{\value{enumi}}
\end{enumerate}

\subsection*{Other components of a language?}

Of course, there are other components, we can list:
\begin{itemize}
\item {\bf Typographic rules} applied in each linguistic country, one
    is, for example: {\em the title caption of a table is printed before
    it in French} or {\em the first paragraph in a section is never
    indented in US-English}. (Will be deeply discussed in next parts).
  Notice, as written by Klaus Lagally, that: the general document
  layout is related to, but neither determined nor specific to the
  default language.  The structure of a paper in Arabic, Persian, and
  Urdu is basically the same.

\item {\bf Linguistic titles} used at the high level language  of
      \LaTeX{}; we are thinking of the translation of ``table of contents'' or
      the dates. (Will be deeply discussed in next parts).
\item {\bf Country abbreviations} which are as much typographic as
      linguistic, for example: {\em Mister is abbreviated Mr in English}
      or {\em primo is abbreviated 1$^o$ in French}.
      (Will be deeply discussed in next parts).
\end{itemize}
Are they specific to a language? Perhaps but we are unable to have a
complete overview of these peculiarities. It can't be formulated in
terms of a value or even a list of strings. Also there are a lot of
countries which have no official standard and therefore any definition
would be seen greatly wrong by a lot of people.

So, we prefer to leave these topics to the language-style
 definition level,
which should be handled by the appropriate linguistic groups.

At this step of language definition it seems appropriate to associate
a {\em style-file-name} for further linguistic peculiarities.  So a
language is also defined by:
\begin{enumerate}
\setcounter{enumi}{\value{languageitems}}
\item The \LaTeX{} style file to apply each time the corresponding
language is in effect. This is a name which is optional. If omitted a
default file name
($\NonTerminal\right.${\it language}$\left.\endNonTerminal$%
\verb|.sty|) could be searched for. We will come back to this default
 in the next paragraph.
\setcounter{languageitems}{\value{enumi}}
\end{enumerate}


\subsection*{The language definition}

As already stated, a language ---in \TeX--- is defined by its internal
number. Everybody prefers to manipulate languages by their names.
Obviously people writing English documents don't all use the same
English language; accordingly, not all English languages printed in the
world have the same \TeX{} internal number, and this is even more true for
other languages.  Thus it is better to give a symbolic name to each
language.

We suggest that the default language style file name for a language
be the language name itself (shortened to 8 chars, plus
\verb|.sty|). What happens if this file is not present on the system?
We suggest that in that specific case a generic style file called
---for example--- \verb|language.sty| be loaded (we will see later
exactly when) in order to establish the appropriate relationships,
mechanisms, default values, etc. This style file could look like a
skeleton for other linguistic style files but could also be the
default language style file of \LaTeX{} itself when no language is
specified by the user at any time.

Then the suggestion proposed to the \LaTeX{} V3 team is to
implement a language definition that could look like this one:

\bigskip
({\em assignments undefined here can be found in the \TeX book}).

\bigskip\noindent
$\begin{bnf}<language assignment> -> \end{bnf}$\verb|\languagedef|
$\begin{bnf}
<control sequence><language general text> \\
<language general text> -> <left brace><language balanced text>%
                           <right brace>\\
<language balanced text> ->  <language number>%
\end{bnf}$\verb|,|\\
$\begin{bnf} \phantom{<language balanced text> -->}%
               \!<patterns general text>
\end{bnf}$\verb|,|\\
$\begin{bnf} \phantom{<language balanced text> -->}%
               \!<hyphenation general text>
\end{bnf}$\verb|,|\\
$\begin{bnf}
\phantom{<language balanced text> -->}\!<lefthyphenmin integer>
\end{bnf}$\verb|,|\\
$\begin{bnf}
\phantom{<language balanced text> -->}\!<righthyphenmin integer>
\end{bnf}$\verb|,|\\
$\begin{bnf}
\phantom{<language balanced text> -->}\!<uchyph integer>
\end{bnf}$\verb|,|\\
$\begin{bnf}
\phantom{<language balanced text> -->}%
               \!<left brace>
                 <zero lccode characters><right brace>
\end{bnf}$\verb|,|\\
$\begin{bnf}
\phantom{<language balanced text> -->}\!<writing directions>
\end{bnf}$\verb|,|\\
$\begin{bnf}
\phantom{<language balanced text> -->}\!<final language general text>\\
<final language general text> -> <font encoding name>\\
\phantom{<final language general text> -->}\!
                          | <font encoding name>\end{bnf}$\verb|,|
                          $\begin{bnf} <language style file name>\\
<language number> -> <number>\quad\%\enspace being [1- 255]\\
<patterns general text> -> <general text> \quad\%\enspace
                               \mbox{\TeX book} \enspace p\enspace 453\\
<hyphenation general text> -> <general text> \quad\%\enspace idem\\
<lefthyphenmin integer> -> <number>\\
<righthyphenmin integer> -> <number>\\
<uchyph integer> -> <number> \quad\%\enspace
                             being\enspace either\enspace 1
                             \enspace or\enspace -1\\
<zero lccode characters> -> <character> | <zero lccode characters>\\
<writing directions> -> <horizontal writing>
                     \end{bnf}$\verb|,|$\begin{bnf}<vertical writing>\\
\phantom{<writing directions> -->}\!\! |<vertical writing>
                    \end{bnf}$\verb|,|$\begin{bnf}<horizontal writing>\\
<horizontal writing> ->
              \end{bnf}$\verb|LR|$\begin{bnf} |\end{bnf}$\verb|RL|\\
$\begin{bnf}
<vertical writing> ->
              \end{bnf}$\verb|TD|$\begin{bnf} |\end{bnf}$\verb|BU|\\
$\begin{bnf}
<font encoding name> -> <general text>\quad\%\enspace
                               cf\enspace nfss2\enspace
                        text\enspace font \enspace encoding.\\
<language style file name> -> <filename>
\end{bnf}${\space\space\em \% Default being the name of the }\\
$\begin{bnf}
\phantom{<language style file name> -->}\! |<filename>
\end{bnf}$\verb|.sty |{\em \% }
$\begin{bnf} <control sequence>\end{bnf}${\em above.}

\section{What is a dialect?}

Generally speaking a dialect is a language that differs from its parent
(or twin) language at one or more levels:
\begin{itemize}
\item pronunciation,
\item meaning of words,
\item phrase syntax,
\item use of verbs,
\item \ldots
\end{itemize}

Of course, a dialect has its own typographic requirements. Due to
pronunciation differences, for example, hyphenation may differ.
The patterns file is often different because ancient words that might
no longer be used in its parent language could be frequently used in a
dialect.  Typesetting of dates, formatting of titles, sectioning, etc.
may differ, depending on country habits or legal rules or historical
events.

At a generic level, we don't see something commonly specific in all
written dialects that need to be defined differently from a language.
A dialect is also a word that is a little pejorative.

Therefore we think that there is no real advantage to offer any
command like \verb|\dialectdef| which would bring nothing different
than \verb|\languagedef|. So we suggest that:

\begin{quote}
\em A dialect, inside \LaTeX{}, is just like a language.
\end{quote}

\section{Area of language application}

\subsection*{Where does a language apply?}

Basically, Donald Knuth defined on which shortest entity a language
could be applied: ``\TeX{} is able to work with several languages in the
same paragraph.'' This means that we can really switch to another language
anywhere in a paragraph. Do our \verb|\languagedef| parameters apply
anywhere?

In fact, there may exist parameters which apply to a whole paragraph
(\verb|\uchyph| ?) or even to a larger entity like the page. Obviously,
the language style file can also have effects at specific boundaries.
Let's imagine a style that would require each page to be printed with a
dropped initial letter. This kind of effect has to be typeset.  If the
typesetting has already started with another style (of another
language) we can't produce this facility in that new language before
the next page boundary arises. So we can switch to any language at any
time and anywhere but some features may be postponed after a specific
boundary, or even not applied at all if the current environment is not
appropriate.

\subsection*{How does the user switch from one language to another?}

Assuming we have defined {\em language1\/} and {\em language2\/} via
the appropriate commands \verb|\languagedef\|{\em
  language1\/}\verb|{...}| and \verb|\languagedef\|{\em
  language2\/}\verb|{...}| respectively, we can imagine to type
\verb|\|{\em language1\/} to switch to {\em language1\/} and then
provide all typesetting facilities given in that language (idem with
{\em language2\/}).

Well, what about the \TeX{} grouping mechanism? We have here three
test cases to consider:

\begin{description}
\item{Test 1:} \verb|\|{\em language1\/} ... text1
               \verb|{\|{\em language2\/} ... text2\verb|}|
                ... TEST1 ...

                ``text1'' will be printed as in {\em language1\/},
                ``text2'' according to {\em language2\/} and
                ``TEST1''? Normally ``TEST1'' has to be printed as in
                {\em language1\/} but it will be done only if:

\begin{quote}
\em A language definition is local.
\end{quote}

This is an important rule. Does it mean that no global definition is
allowed to be issued at the language switch time? No, it only means
that no linguistic feature is allowed to be applied globally
(\verb|\global| defs or \verb|\xdef|).

\item{Test 2:} \verb|\|{\em language1\/} ... text1
               \verb|{\|{\em language2\/} ... TEST2\verb|}|

               ``TEST2'' will be printed as in {\em language2\/} but
               also ---probably--- as in {\em language1\/} if an
               elaborated {\em language1\/} style file is in use.
               Why?

Obviously, the two languages can have different typesetting or formatting
techniques.  Let's imagine that the first one never prints blank spaces,
i.e. words are never separated by a space.  Then there is a good chance
that ``TEST2'' will be printed with that feature, due to the lack
of an explicit order to deactivate the first language, and it may happen
that the second language doesn't care about the {\em nospace} feature.

Let's suppose now that the language switching mechanism has been
 designed so that:
when there is a call to another language,
then the former facilities or features
are stopped immediately.  That is the choice we show here by means of
the \verb|\switchto| command:

\item{Test 3:} \verb|\switchto\|{\em language1\/} ... text1
               \verb|{\switchto\|{\em language2\/} ... text2\verb|}|
 ... TEST3 ...

 {\em language1\/} is stopped when \verb|\|{\em language2\/} is
 executed. ``text2'' is printed as in {\em language2\/}. Since {\em
   language2\/} is enclosed within \verb|{| and \verb|}| it has no
 effect outside the braces. So we can't decide which is the language
 to apply to ``TEST3''! In reality, it should be the default \TeX{}
 language number 0, but that's all we can say...
\end{description}

We suggest another way, in 2 parts, which is a kind of compromise:
\begin{enumerate}
\item A complete language environment.

\verb|\begin{language}[|{\em language1\/}\verb|]|\\
\hspace*{2em} text1\\
\verb|\end{language}|\\
\verb|\begin{language}[|{\em language2\/}\verb|]|\\
\hspace*{2em} text2\\
\verb|\end{language}|\\
TEST3

That way, {\em language1\/} is ended explicitly. ``TEST3'' is printed
as in the default language and there is no possibility that ``text2''
is printed with some features of {\em language1\/}. It implies that:

\begin{quote}
  \em Two \verb|\begin{language}| must be separated by one
  \verb|\end{language}|.
\end{quote}

This rule is really a great constraint when we need to print a few words
in another language, so we suggest also the following for short
inserts:

\item A temporary language switch.

\verb|\begin{language}[|{\em language1\/}\verb|]|\\
\hspace*{2em} text1\\
\verb|{\|{\em language2\/} text2\verb|}|\\
\hspace*{2em} text1 continued\\
\verb|\end{language}|\\
TEST3

In this scheme, {\em language1\/} application is just deferred when
{\em language2\/} is called. ``text2'' is printed as in {\em
  language2\/} and after the closing brace {\em language1\/} is
automatically enabled again.

\begin{quote}
\em
At one time there are at most two activated languages  and only one
enabled.
\end{quote}
 This will efficiently reduce memory usage and avoid complexity.
\end{enumerate}

People will have also the possibility to use the shortest form of
language switching:

\medskip%\noindent
\verb|\|{\em language1\/} text1 \verb|\|{\em language2\/} text2 \verb|\|
{\em language1\/} text3

\medskip
\noindent but this way people must be warned to avoid surrounding
braces, as we stated previously, for the third test case. This
unstructured form should not be used if we want to be respectful of
structured standards like SGML.  But we strongly suggest to the
\LaTeX{} V3 team that this feature remain available for its
simplicity and for upwards compatibility.

We wrongly said that there were only two languages at the same time
because, in fact, there is always the default language. So there may
be up to three loaded languages at the same time.

\subsection*{What is the default language?}

We suggest first that the default language be an arbitrary,
artificial language, i.e. one specially created for a single purpose: to be
recognizable (a \verb|\languagedef| without any realistic interest).
This default \LaTeX{} language would normally never be used. It could
be the language number zero.  \LaTeX{} could check frequently, for
example at the beginning of each environment, whether the current language
has wrongly become the default language, and then inform the user.

Secondly, we suggest that the user have the possibility to choose his own
default language for all his documents. This language will be called the
{\em main language}.
The internal language number
one could be reserved for that usage, leaving the installer the
possibility to customize it (we will propose a specific way in a later
chapter).

These two proposals will bring an important drawback: some
incompatibility with \LaTeX{} V2 documents. Since the origin of \TeX{} V3 (in
1989), language number zero has been dedicated to US-English. So any
document in English could be typeset without any change all around the
world. It was really a great facility for the few people having to
exchange their documents with foreign colleagues but we are convinced
that it doesn't concern the great majority of \LaTeX{} end users. In
fact there are two points to discuss here:

\begin{itemize}
\item {\bf language number 0} was the default language, it might have
  been tested by any code to check if US-English was activated. We
  think that this case is extremely rare now. The original German
  style was assuming that language zero was US-English but all other
  multilingual mechanisms (Babel, French) didn't make this assumption.

\item {\bf US-English} was the default language. \LaTeX{} was designed
  for \TeX{} V2 and so used the only language provided with
  \verb|plain| \TeX{}: US-English. Nothing was changed about this
  topic when \LaTeX{} moved to \TeX{} V3. The major drawback of having
  US-English as default language was that people thought there were no
  alternatives. Therefore \LaTeX{} was known to be a typesetting tool
  for Americans and more broadly for English-speaking users. In our
  opinion the worldwide expansion of \LaTeX{} suffered from this default
  value and we think that it would be extremely fruitful if version 3 were
  not so linked to any concrete language. We will propose instead a
  compatibility option to keep this former default active.
\end{itemize}

\subsection*{A language option with \mediumseries{\tt\char'134%
            begin\char'173 document\char'175}}

It would be useful to add an {\em optional}
 parameter to \verb|\begin{document}|
in order to allow the user to specify the language to use for the whole
document. This method may look simpler for the user:

\bigskip\noindent
\verb|\begin{document}[mainlanguage=|{\em any language}\verb|]|

\bigskip\noindent
as opposed to:

\bigskip\noindent
\verb|\begin{document}|\\
\verb|\begin{language}[|{\em any language}\verb|]|\\
\verb|...|\\
\verb|\end{language}|

\bigskip
In a multi-user installation it may occur that no user default language
can be chosen. Therefore it will be absolutely necessary to specify a
language when beginning a document; this way is really easy.

It's now becoming obvious to say that:

\begin{quote}
  \em Language(s) application in \LaTeX{} is bounded inside
  \verb|\begin{document}| and \verb|\end{document}|.
\end{quote}

This rule will raise the problem of processing the auxiliary files
which are ordinarily read at the beginning and at the end of the
document; we will discuss this point later in part III.

We all know that there are in \LaTeX{} V2 a few facilities to typeset
many documents at a time; the letter style offers this facility. So we
suggest that the same kind of language specification apply to such
sub-documents:

\bigskip\noindent
\verb|\begin{letter}[|{\em any language}\verb|]|\\
\verb|...|\\
\verb|\end{letter}|

\section{What is a multi-lingual document?}

Obviously a multilingual document is a collection of texts in
different languages. The length of texts is a determinant point of
multilingualism complexity. When the only foreign words used in a
document are the authors' names cited in references, we can say that
such a document is the least multilingual one. On the other hand, an
example like an EEC document, in 9 languages, can be evaluated as
deeply multilingual but in fact each language is entirely disconnected
from each other: this is only a translation in different languages of
the same text, each translated part having to fit the same area in the
page when typeset in columns. This example is really a medium
multilingual document because the typographic rules required to format
the pages are not specific to any language but they obey the EEC
page layout. The most complex multilingual document is probably made
of pages printed according to several languages, one page obeying the
typographic rules of one language and another page typeset in another
language. Such a document is not really usual. More realistically and
frequently we can see documents typeset according to typographic rules
of one language but with some inserts (often quotations) in other
languages.

We will try to list in a later chapter all typographic mechanisms that
would be necessary to implement most of the multilingual facilities
required in the world. One of these is for example the ability to
simply typeset a table in which each row is printed in a different
language.

To simplify as much as possible, we can say that each \LaTeX{} document
printing mathematics is, de facto, a multi-lingual document.  Notice (as
pointed out by Klaus Lagally) that a mathematical formula is not a sequence
of character glyphs, but a picture.  Maths can be printed within text
or displayed, anywhere, requiring specific typographic rules, etc.
just like a language. Indeed, there are even slight differences
between countries in how math is printed, so we are suggesting that:

\begin{quote}
\em With \LaTeX{} V3, each document is multi-lingual.
\end{quote}

This rule will allow linguistic and typographic modifications to be applied
to a document using only one language. A real multi-lingual document,
i.e.  using more than one language, is not much more complex: the
only difference concerns the internal language switching
mechanism that we will discuss in the next part.

\section{Summary about language levels}

As you have probably noticed, we distinguished 5 levels of language
application:

\begin{enumerate}
\item \LaTeX{} internals (the default language).
\item Main document (user default language or main language).
\item Long parts (\verb|\begin{language}...\end{language}|).
\item Short inserts (\verb|{\|{\em language}\verb|...}|).
\item Mathematics.
\end{enumerate}

In order to allow the linguistic code to run efficiently at any of
these levels we recommend that:

\begin{quote}
\em
\LaTeX{} V3 will keep in mind what level of language application is
actually running, anywhere during the typesetting of the document.
\end{quote}

(notice that, probably, the information will not always come from
\LaTeX{} but also from the linguistic style.)

%\section{More comments about Part I}
%\bibliographystyle{plain}
\begin{thebibliography}{10}
\bibitem{tub:JBr93}
Johannes Braams.
\newblock {\em Babel, a multilingual style-option  system
          for use with \LaTeX's standard document styles.}
\newblock {\em {TUG}boat}, 12, no. 2, 1991, pp. 291-301.

\bibitem{tub:JBr93b}
Johannes Braams.
\newblock {\em An update on the {\tt babel} system.}
\newblock {\em {TUG}boat}, 14, no. 1, April 1993, pp. 60-62.

\def\Accent^\TeX{T\kern-.1667em\lower.5ex\hbox{\^E}\kern-.125emX}
\bibitem{f1:MF85} Michael J. Ferguson.
\newblock {\em A Multilingual \Accent^\TeX.}
\newblock {\em {TUG}boat}, 6, no. 2,  1985, pp. 57-59.

\bibitem{f1:MF87} Michael J. Ferguson.
\newblock {\em A Multilingual \Accent^\TeX}
\newblock {Rapport technique de l'INRS-T\'el\'ecommunications}, 87-23,
 1987.

\bibitem{gut1:YH92} M. Fanton \& Y. Haralambous, Conference:
\newblock \TeX{} et les langues Orientales
\newblock december 1992, to appear in the {\em Cahiers GUTenberg}.

\bibitem{Kluwer:EvHe90}
Eric van Herwijnen.
\newblock {\em Practical {SGML}}.
\newblock Wolters Kluwer Academic Publishers, Dordrecht, 1990.

\bibitem{A-W:DKn91}
Donald~E. Knuth.
\newblock Computers \& {T}ypesetting / A. {\em The \TeX book}.
\newblock Addison-Wesley, Reading, Massachusetts, 1991.

\bibitem{KD:daly93}
H. Kopka \& P.W. Daly.
\newblock {\em A guide to \LaTeX.}
\newblock Addison-Wesley, 1993.

\bibitem{A-W:LLa86}
Leslie Lamport.
\newblock {\em {\LaTeX:} A Document Preparation System}.
\newblock Addison-Wesley, Reading, Massachusetts, 1986.

\bibitem{cahgut:MS92}
Frank Mittelbach \& Rainer Sch{\"o}pf.
\newblock {\LaTeX} V3 conference in Paris,
\newblock june 1992.

\bibitem{cahgut:MS90}
Frank Mittelbach and Rainer Sch{\"o}pf.
\newblock {\em {\LaTeX} dans les ann\'{e}es 90.}
\newblock {\em Cahiers {GUT}enberg}, (6):2--14, July 1990.

\bibitem{ast93:LS93}
Laurent Siebenmann.
\newblock{\em A format Compilation Framework for European Languages}.
\newblock {\em {TUG}boat}, 14, no. 2, 1993, to appear.

\bibitem{cpc:RWo90}
Reinhard Wonneberger.
\newblock {\em Structured document processing: the {\LaTeX} approach.}
\newblock {\em Computer Physics Communications}, (61):177--189, 1990.

\end{thebibliography}

\newpage
\vfill
\def\contentsname{Contents of Part I: Definitions}
\tableofcontents
\vfill

\section*{To appear}
{\small\begin{verbatim}
Part II, LANGUAGE HANDLING
==========================

Section 1: Hyphenation loading mechanism
Section 2: Hyphenation exceptions handling
Section 3: Related file names standard
Section 4: Switching mechanism
\end{verbatim}

\begin{verbatim}
Part III, TYPOGRAPHY HANDLING MECHANISMS
========================================
\end{verbatim}

\begin{verbatim}
What are the required mechanisms to apply typographic features for
various (numerous) languages? For example we don't know when it is
required to apply the french guillemets BUT we must know that there
are cases where specific characters (here the french guillemets) need
to be inserted in front of each line and at \everypar.
\end{verbatim}

\begin{verbatim}
Section 1: Line level
Section 2: Paragraph level
Section 3: Page level
Section 4: Column level
Section 5: Table row and column levels
Section 6: Two-sides sheet level
Section 7: Math levels (text and display)
Section 8: Sectioning level as well as start/end of documents
Section 9: \shipout level
\end{verbatim}

\begin{verbatim}
Part IV, OTHER LANGUAGE REQS
============================

This part will address features not processed in previous parts and
not already solved in the current LaTeX version or via appropriate
style files.
So it isn't exhaustive.

Are there specific mechanisms required to process the following:
Section 1: Bibliography
Section 2: Index
Section 3: Glossary
Section 4: Typographic abbreviations
Section 5: Country Fonts
Section 6: Table of..., abstracts, summaries, keywords,...
Section 7: ??

  Thanks for your effort to cooperate,
  --bg
\end{verbatim}}

\end{document}