\documentclass[landscape]{slides}
\textheight=1.2\textheight
\oddsidemargin=0cm
\topmargin=-2.0cm
\usepackage{color}
\usepackage{graphicx}

\definecolor{backgroundcolor}{rgb}{.04,.08,.14}
\definecolor{textcolor}{rgb}{.96,.96,1}
\definecolor{headcolor}{rgb}{.96,.77,.15}

\usepackage{url}

% \slt{title}: slide title.  Typesets the title centred, in headcolor, as
% \Large bold text, then inserts a \vfill and switches to \raggedright for
% whatever follows on the slide.
\newcommand{\slt}[1]{%
	\centering
	\textcolor{headcolor}{\Large\textbf{#1}}%
	% Empty box followed by stretchable vertical space below the title.
	\hbox{}\vfill
	\raggedright
	\vspace{-1ex}\par
}
% \prog{name}: set its argument in typewriter type.
\newcommand{\prog}[1]{%
	\texttt{#1}%
}
% Scratch length: width of the most recently typeset title suffix.
\newlength\contboxwidth
% \titlesuffix{text}: set " text" in \small inside box 0, emit the box, then
% skip back by the box's width so that the net horizontal advance is zero.
\newcommand{\titlesuffix}[1]{%
	\setbox0=\hbox{\small\space#1}%
	\contboxwidth=\wd0
	\box0
	\hspace{-\contboxwidth}%
}
% \cont: "(continued)" marker, used inside \slt titles.
\newcommand{\cont}{\titlesuffix{(continued)}}
% \visa: "no visa" marker, built the same way as \cont.
\newcommand{\visa}{\titlesuffix{no visa}}
% Define \abovecaptionskip as a macro expanding to 5ex.
% NOTE(review): \def replaces any existing definition without warning, and
% nothing in the visible source uses \abovecaptionskip -- confirm that this
% line is still needed.
\def\abovecaptionskip{5ex}
% \fa{text}: pass text to \textfarsi at \Large size inside an unbreakable
% \mbox.
% NOTE(review): \textfarsi is not defined anywhere in the visible preamble;
% presumably it comes from a Farsi-support package -- confirm before using
% \fa.
\newcommand{\fa}[1]{%
	\mbox{\Large\textfarsi{#1}}%
}

% Notation helpers for Unicode code points and character names.
%
% \unicode{XXXX}: a code point, printed as U+XXXX.
\newcommand{\unicode}[1]{%
	{U+#1}%
}
% \uniname{NAME}: a character name, emphasised.
\newcommand{\uniname}[1]{%
	\emph{#1}%
}
% \uni{XXXX}{NAME}: a code point followed by its character name.
\newcommand{\uni}[2]{%
	\unicode{#1} \uniname{#2}%
}
% \uniseq{items}: a sequence enclosed in angle brackets.
\newcommand{\uniseq}[1]{%
	$\langle$#1$\rangle$%
}
% \unirange{XXXX}{YYYY}: an angle-bracketed code point range, XXXX..YYYY.
\newcommand{\unirange}[2]{%
	\uniseq{\unicode{#1}..\unicode{#2}}%
}

% Wrap the slide and overlay environments.  The existing begin/end commands
% are saved under \save... names first; each replacement then calls the
% saved command.
%
% Opening commands: run the saved command, then \flushright.
\let\saveslide\slide
\let\saveoverlay\overlay
\renewcommand{\slide}{\saveslide\flushright}
\renewcommand{\overlay}{\saveoverlay\flushright}
% Closing commands: insert a \vfill before the saved end command.
\let\saveendslide\endslide
\let\saveendoverlay\endoverlay
\renewcommand{\endslide}{\vfill\saveendslide}
\renewcommand{\endoverlay}{\vfill\saveendoverlay}

% Select the text colour once in the preamble and again right after
% \begin{document}.
\color{textcolor}
\begin{document}
\color{textcolor}
% Page background colour, and \large as the size for the text that follows.
\pagecolor{backgroundcolor}
\large

% Title slide: talk title, the ddc.png image, author and project details,
% then the conference name and date.
\slide
\thispagestyle{empty}
\centering
% Title in headcolor; the one-column tabular stacks the three title lines.
\textcolor{headcolor}{\Huge\bfseries
\begin{tabular}{c}
	Bloat of Data\\[.3em]{\Large in the}\\[.3em]Unicode Era
\end{tabular}}\par
{
\includegraphics[height=5ex]{ddc.png}
% Author and project: each \stackrel places its first argument above its
% second; the \mbox wrappers keep the contents in text mode.
\vfill\textcolor{textcolor}{\Large
\vspace{-1ex}
$\stackrel%
{\mbox{Behdad Esfahbod}}%
{\mbox{\large\texttt{behdad@behdad.org}}}
$
\\[2ex]
$\stackrel%
{\mbox{The FarsiWeb Project}}%
{\mbox{\large\texttt{http://www.farsiweb.info/}}}$
}}\par
\vspace{-1ex}
% -- Or presentation date?
{
\large Desktop Developers' Conference '05\\July 18, 2005}
% --
\endslide

\slide
\slt{Agenda}
\begin{itemize}
\item The Importance of Unicode
\item Unicode Character Database
\item Common Locale Data Repository
\item Future Plan
\end{itemize}
\endslide

\slide
\slt{The Importance of Unicode}
\begin{itemize}
\item \textbf{The Old Days$^{\mbox{\scriptsize TM}}$:} Gazillions of 8-bit
character sets
\item \textbf{ISO~10646:} A unified character set
\item \textbf{The Unicode Standard:} And unified algorithms to deal with
this unified character set
\end{itemize}
\endslide

\slide
\slt{Intro to Unicode}
\begin{itemize}
\item Currently at 4.1.0 release
\item Not 16-bit, 21-bit ($16+\log_2 17 = 20.087462841250339$-bit)
\item A unique non-negative integer less than 1,114,112 assigned to each
character
\end{itemize}
\endslide

\slide
\slt{Intro to Unicode\cont}
\begin{itemize}
\item Slightly less than 100,000 characters registered so far
\item New scripts and characters are encoded with each release
\item Major releases published as a book, with online updates for minor
releases
\end{itemize}
\endslide

\slide
\slt{Intro to Unicode\cont}
\begin{itemize}
\item The book is available online as PDF files
\item The updates and other references are available in plain HTML
\item Data files as text files
\end{itemize}
\endslide

\slide
\slt{Architectural View of Unicode}
\begin{itemize}
\item \textbf{The book:} The Unicode encoding model,
Encoding model and \emph{issues} for individual scripts
\item \textbf{Key specifications:} \emph{Standard Annex}, \emph{Technical
Standard}, or \emph{Technical Report}, algorithms for rendering or otherwise dealing
with text
\item \textbf{Data files:} \emph{Unicode Character Database}, text files
that define character properties and internal mappings
\end{itemize}
\endslide

\slide
\slt{Key Specifications}
\begin{itemize}
\item Unicode Collation (UCA)
\item Bidirectional Algorithm (Bidi)
\item Normalization (NFC, NFD, \ldots)
\end{itemize}
\endslide

\slide
\slt{Unicode Character Database}
\begin{itemize}
\item More than 70 character properties
\item The canonical character name, e.g.\ \unicode{0041} is \uniname{LATIN CAPITAL LETTER A}
\item The most commonly used one is the \emph{General Category},
e.g.\ \unicode{0041} is \emph{Lu}: Letter, upper case
\item Mostly binary and enumerated properties
\end{itemize}
\endslide

\slide
\slt{They Show Up Everywhere}
\begin{itemize}
\item Glibc character types: \texttt{isalpha}, \texttt{isdigit},
\texttt{isprint}, \ldots (\texttt{ctype.h})\\
\textbf{Warning:} The C standard limits the value of some of these functions
\item Convenience and module libraries: Glib has some, Qt's \texttt{QChar} class has
some,
Python's \texttt{unicodedata} module has the important ones, Perl supports
all of them in regular expressions
\end{itemize}
\endslide

\slide
\slt{And in (Some) Applications}
\begin{itemize}
\item Gucharmap uses them of course
\item Terminal emulators use the \texttt{wcwidth} function from Markus Kuhn
\item But not much more
\end{itemize}
\endslide

\slide
\slt{Where Else is it Useful?}
\begin{itemize}
\item I want my editor to show the character names
\item Unicode regular expressions (PCRE)
\item Wherever a list of scripts is useful
\end{itemize}
\endslide

\slide
\slt{The Problem}
\begin{itemize}
\item Glibc is not available everywhere
\item The manual and Perl-script approaches, the 2-year cycle, performance
\item Different versions of the data around:
an old Glibc, Glib, FriBidi in
Pango, wcwidth in gnome-terminal, gucharmap, \ldots
\end{itemize}
\endslide

\slide
\slt{The Problem\cont}
\begin{itemize}
\item File formats, default values, etc, change.  Can go unnoticed
\item High entry cost for getting the data in your application
\item Support for new scripts is broken for years
\end{itemize}
\endslide

\slide
\slt{Ideally}
\begin{itemize}
\item A new approach to Unicode libraries: Only data, no converters, no
algorithms
\item A central library exporting the UCD efficiently
\item Easier maintenance, easier update
\end{itemize}
\endslide

\slide
\slt{Ideally\cont}
\begin{itemize}
\item Better memory overhead, more sharing
\item Problems in format change, etc have more chance to get noticed
\item Different versions of the UCD can live together (IDN requires 3.2)
\end{itemize}
\endslide

\slide
\slt{Ideally\cont}
\begin{itemize}
\item A runtime library that you can query for properties efficiently
\item A development kit that generates efficient lookup-table code for
pedantic projects
\item Central translation effort for property names, script names, character
names, etc
\end{itemize}
\endslide

\slide
\slt{Where are We Now?}
\begin{itemize}
\item Planning
\item Got the name: \textbf{gNUichar}
\item A binding-friendly efficient design
\item Fetch compressor and bits from different projects
\item Release and advertise
\end{itemize}
\endslide

\slide
\slt{Localization}
\begin{itemize}
\item Much trickier than internationalization
\item $O(n^2 + k \cdot n)$ where $n$ is the number of languages and $k$ is the
number of different atoms
\item More exposed to the end user: date formats, number formats, language
names, country names, etc
\end{itemize}
\endslide

\slide
\slt{Locale Data}
\begin{itemize}
\item Glibc has the basic functionality, but very limited
\item Evolution has a handful of date formats to translate, other modules
do too
\item Several projects maintain a list of language names and countries,
that get translated separately
\item Paper sizes, date formats, currency, timezone, etc
\end{itemize}
\endslide

\slide
\slt{The Problem}
\begin{itemize}
\item Again, Glibc is not available everywhere
\item If no Glibc locale, no support
\item Translating country names and language names is quite hard
\item Maintenance is a nightmare
\end{itemize}
\endslide

\slide
\slt{Common Locale Data Repository}
\begin{itemize}
\item A group effort coordinated by the Unicode Consortium
\item Backed by companies like IBM, Sun, Apple, etc
\end{itemize}
\endslide

\slide
\slt{CLDR Architecture}
\begin{itemize}
\item Current version is 1.3.0
\item Released as a set of XML files
\item Using inheritance to reduce the effort
\item XML and file-based inheritance make it hard to use
\end{itemize}
\endslide

\slide
\slt{The Problem\cont}
\begin{itemize}
\item The XML architecture makes it pretty hard to use CLDR in an application
\item Overlaps with the Glibc data
\end{itemize}
\endslide

\slide
\slt{Ideally}
\begin{itemize}
\item A central library to export the CLDR efficiently
\item Qt has its own locale system, convert
\item GNOME doesn't have a locale system, push in
\end{itemize}
\endslide

\slide
\slt{Currently}
\begin{itemize}
\item A new list created for discussion, \texttt{locale-list@gnome.org}
\item ICU may finally be useful
\item A long way to go, help needed, in design and implementation
\end{itemize}
\endslide

\slide
\slt{In the Future}
\begin{itemize}
\item Get these two libraries released
\item Build a higher-level locale library for GNOME
\item Start cleaning up GNOME and KDE
\item What else? Questions?
\end{itemize}
\endslide

\end{document}

\slide
\slt{}
\begin{itemize}
\item
\item
\item
\end{itemize}
\endslide


