Skip to content

Commit

Permalink
Merge pull request #10659 from stevengj/utf8proc-new
Browse files Browse the repository at this point in the history
update utf8proc, replace wcwidth
  • Loading branch information
stevengj committed Mar 30, 2015
2 parents ba22a70 + 58578b0 commit 92538cf
Show file tree
Hide file tree
Showing 20 changed files with 99 additions and 417 deletions.
6 changes: 3 additions & 3 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@
[submodule "deps/openspecfun"]
path = deps/openspecfun
url = git://github.com/JuliaLang/openspecfun.git
[submodule "deps/libmojibake"]
path = deps/libmojibake
url = git://github.com/JuliaLang/libmojibake.git
[submodule "deps/utf8proc"]
path = deps/utf8proc
url = git://github.com/JuliaLang/utf8proc.git
14 changes: 6 additions & 8 deletions Make.inc
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ USE_SYSTEM_SUITESPARSE=0
USE_SYSTEM_RMATH=0
USE_SYSTEM_LIBUV=0
USE_SYSTEM_UTF8PROC=0
USE_SYSTEM_MOJIBAKE=0
USE_SYSTEM_UTF8PROC=0
USE_SYSTEM_LIBGIT2=0
USE_SYSTEM_PATCHELF=0

Expand Down Expand Up @@ -605,14 +605,12 @@ else
LIBUV_INC = $(JULIAHOME)/deps/libuv/include
endif

ifeq ($(USE_SYSTEM_MOJIBAKE), 1)
LIBMOJIBAKE = -lmojibake
ifeq ($(USE_SYSTEM_UTF8PROC), 1)
LIBUTF8PROC = -lutf8proc
UTF8PROC_INC = /usr/include
else
ifeq ($(USE_SYSTEM_UTF8PROC), 1)
LIBMOJIBAKE = -lutf8proc
else
LIBMOJIBAKE = $(build_libdir)/libmojibake.a
endif
LIBUTF8PROC = $(build_libdir)/libutf8proc.a
UTF8PROC_INC = $(JULIAHOME)/deps/utf8proc
endif

# OS specific stuff
Expand Down
26 changes: 17 additions & 9 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,21 @@ Library improvements

* Add sparse least squares to ``\`` by adding ``qrfact`` for sparse matrices based on the SPQR library. ([#10180])

* String improvements

* `graphemes(s)` returns an iterator over grapheme substrings of `s` ([#9261]).

* Character predicates such as `islower()`, `isspace()`, etc. use
utf8proc to provide uniform cross-platform behavior and
up-to-date, locale-independent support for Unicode standards
([#5939]).

* `reverseind` function to convert indices in reversed strings (e.g. from
reversed regex searches) to indices in the original string ([#9249]).

* `charwidth(c)` and `strwidth(s)` now return up-to-date cross-platform
results (via utf8proc) ([#10659]): Julia now likes pizza ([#3721]).

* Other improvements

* `gc_enable` and `gc_disable` return the previous GC state.
Expand Down Expand Up @@ -178,15 +193,6 @@ Library improvements

* Efficient `mean` and `median` for ranges ([#8089]).

* `graphemes(s)` returns an iterator over grapheme substrings of `s` ([#9261]).

* Character predicates such as `islower()`, `isspace()`, etc. use utf8proc/libmojibake
to provide uniform cross-platform behavior and up-to-date, locale-independent support
for Unicode standards ([#5939]).

* `reverseind` function to convert indices in reversed strings (e.g. from
reversed regex searches) to indices in the original string ([#9249]).

* New `Nullable` type for missing data ([#8152]).

* `deepcopy` recurses through immutable types and makes copies of their mutable fields ([#8560]).
Expand Down Expand Up @@ -1084,6 +1090,7 @@ Too numerous to mention.
[#3688]: https://github.com/JuliaLang/julia/issues/3688
[#3697]: https://github.com/JuliaLang/julia/issues/3697
[#3719]: https://github.com/JuliaLang/julia/issues/3719
[#3721]: https://github.com/JuliaLang/julia/issues/3721
[#3737]: https://github.com/JuliaLang/julia/issues/3737
[#3759]: https://github.com/JuliaLang/julia/issues/3759
[#3790]: https://github.com/JuliaLang/julia/issues/3790
Expand Down Expand Up @@ -1313,3 +1320,4 @@ Too numerous to mention.
[#10446]: https://github.com/JuliaLang/julia/issues/10446
[#10458]: https://github.com/JuliaLang/julia/issues/10458
[#10543]: https://github.com/JuliaLang/julia/issues/10543
[#10659]: https://github.com/JuliaLang/julia/issues/10659
1 change: 0 additions & 1 deletion base/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,6 @@ startswith(a::Array{UInt8,1}, b::Array{UInt8,1}) =

## character column width function ##

charwidth(c::Char) = max(0,Int(ccall(:wcwidth, Int32, (UInt32,), c)))
strwidth(s::AbstractString) = (w=0; for c in s; w += charwidth(c); end; w)
strwidth(s::ByteString) = Int(ccall(:u8_strwidth, Csize_t, (Ptr{UInt8},), s.data))
# TODO: implement and use u8_strnwidth that takes a length argument
Expand Down
17 changes: 10 additions & 7 deletions base/utf8proc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype
export isgraphemebreak

# also exported by Base:
export normalize_string, graphemes, is_valid_char, is_assigned_char,
export normalize_string, graphemes, is_valid_char, is_assigned_char, charwidth,
islower, isupper, isalpha, isdigit, isnumber, isalnum,
iscntrl, ispunct, isspace, isprint, isgraph, isblank

# whether codepoints are valid Unicode
is_valid_char(c) = (0x0 <= c <= 0x110000) && Bool(ccall(:utf8proc_codepoint_valid, Cuchar, (Int32,), c))
is_valid_char(c::Char) = is_valid_char(UInt32(c))
is_valid_char(c::Union(UInt8,UInt16,UInt32,Char)) = Bool(ccall(:utf8proc_codepoint_valid, Cuchar, (UInt32,), c))
is_valid_char(c::Integer) = (0x0 <= c <= 0x110000) && is_valid_char(UInt32(c))

# utf8 category constants
const UTF8PROC_CATEGORY_CN = 0
Expand Down Expand Up @@ -116,10 +116,13 @@ end

############################################################################

# returns UTF8PROC_CATEGORY code in 1:30 giving Unicode category
charwidth(c::Char) = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))

############################################################################

# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
function category_code(c)
UInt32(c) > 0x10FFFF && return 0x0000 # see utf8proc_get_property docs
return unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (Int32,), c))
return ccall(:utf8proc_category, Cint, (UInt32,), c)
end

is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN
Expand Down Expand Up @@ -176,7 +179,7 @@ end
# iterators for grapheme segmentation

isgraphemebreak(c1::Char, c2::Char) =
ccall(:utf8proc_grapheme_break, Bool, (Char, Char), c1, c2)
ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2)

immutable GraphemeIterator{S<:AbstractString}
s::S # original string (for generation of SubStrings)
Expand Down
8 changes: 4 additions & 4 deletions contrib/windows/msys_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,9 @@ echo 'override LIBLAPACKNAME = $(LIBBLASNAME)' >> Make.user
# Remaining dependencies:
# libuv since its static lib is no longer included in the binaries
# openlibm since we need it as a static library to work properly
# mojibake since its headers are not in the binary download
# utf8proc since its headers are not in the binary download
echo 'override STAGE1_DEPS = uv' >> Make.user
echo 'override STAGE2_DEPS = mojibake' >> Make.user
echo 'override STAGE2_DEPS = utf8proc' >> Make.user
echo 'override STAGE3_DEPS = ' >> Make.user
make -C deps get-uv

Expand All @@ -197,8 +197,8 @@ if [ -n "$USEMSVC" ]; then
# Since we don't have a static library for openlibm
echo 'override UNTRUSTED_SYSTEM_LIBM = 0' >> Make.user

# Compile libuv and mojibake without -TP first, then add -TP
make -C deps install-uv install-mojibake
# Compile libuv and utf8proc without -TP first, then add -TP
make -C deps install-uv install-utf8proc
cp usr/lib/uv.lib usr/lib/libuv.a
echo 'override CC += -TP' >> Make.user
else
Expand Down
62 changes: 31 additions & 31 deletions deps/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ include $(JULIAHOME)/Make.inc
# if you are adding a new target, it can help to copy a similar, existing target
#
# autoconf configure-driven scripts: llvm pcre arpack fftw unwind gmp mpfr patchelf uv
# custom Makefile rules: openlibm Rmath-julia dsfmt suitesparse-wrapper suitesparse lapack openblas mojibake objconv
# custom Makefile rules: openlibm Rmath-julia dsfmt suitesparse-wrapper suitesparse lapack openblas utf8proc objconv
# CMake libs: libgit2
#
# downloaded from git: llvm-svn, uv, libopenlibm, mojibake, openspecfun
# downloaded from git: llvm-svn, uv, libopenlibm, utf8proc, openspecfun
#
# there are rules in this file with the . replaced by a %
# this is some magic Makefile trick that tells make
Expand Down Expand Up @@ -150,8 +150,8 @@ ifeq ($(USE_SYSTEM_SUITESPARSE), 0)
STAGE2_DEPS += suitesparse
endif

ifeq ($(USE_SYSTEM_MOJIBAKE), 0)
STAGE2_DEPS += mojibake
ifeq ($(USE_SYSTEM_UTF8PROC), 0)
STAGE2_DEPS += utf8proc
endif

# Only compile standalone LAPACK if we are not using OpenBLAS.
Expand Down Expand Up @@ -183,7 +183,7 @@ install: $(addprefix install-, $(DEP_LIBS))
cleanall: $(addprefix clean-, $(DEP_LIBS))
distcleanall: $(addprefix distclean-, $(DEP_LIBS))
rm -rf $(build_prefix)
getall: get-llvm get-uv get-pcre get-openlibm get-openspecfun get-dsfmt get-Rmath-julia get-openblas get-lapack get-fftw get-suitesparse get-arpack get-unwind get-osxunwind get-gmp get-mpfr get-patchelf get-mojibake get-virtualenv get-objconv get-libgit2
getall: get-llvm get-uv get-pcre get-openlibm get-openspecfun get-dsfmt get-Rmath-julia get-openblas get-lapack get-fftw get-suitesparse get-arpack get-unwind get-osxunwind get-gmp get-mpfr get-patchelf get-utf8proc get-virtualenv get-objconv get-libgit2

## PATHS ##
# sort is used to remove potential duplicates
Expand Down Expand Up @@ -1347,47 +1347,47 @@ compile-fftw-double: $(FFTW_DOUBLE_OBJ_TARGET)
check-fftw-double: fftw-$(FFTW_VER)-double/checked
install-fftw-double: $(FFTW_DOUBLE_OBJ_TARGET)

## MOJIBAKE ##
## UTF8PROC ##

MOJIBAKE_SRC_TARGET = libmojibake/libmojibake.a
MOJIBAKE_OBJ_LIB = $(build_libdir)/libmojibake.a
MOJIBAKE_OBJ_HEADER = $(build_includedir)/mojibake.h
MOJIBAKE_OBJ_TARGET = $(MOJIBAKE_OBJ_LIB) $(MOJIBAKE_OBJ_HEADER)
UTF8PROC_SRC_TARGET = utf8proc/libutf8proc.a
UTF8PROC_OBJ_LIB = $(build_libdir)/libutf8proc.a
UTF8PROC_OBJ_HEADER = $(build_includedir)/utf8proc.h
UTF8PROC_OBJ_TARGET = $(UTF8PROC_OBJ_LIB) $(UTF8PROC_OBJ_HEADER)


libmojibake/Makefile:
utf8proc/Makefile:
(cd .. && git submodule init && git submodule update)
ifeq (exists, $(shell [ -d libmojibake/.git ] && echo exists ))
$(MOJIBAKE_SRC_TARGET): libmojibake/.git/HEAD
ifeq (exists, $(shell [ -d utf8proc/.git ] && echo exists ))
$(UTF8PROC_SRC_TARGET): utf8proc/.git/HEAD
endif
ifeq (exists, $(shell [ -d $(JULIAHOME)/.git/modules/deps/libmojibake ] && echo exists ))
$(MOJIBAKE_SRC_TARGET): $(JULIAHOME)/.git/modules/deps/libmojibake/HEAD
ifeq (exists, $(shell [ -d $(JULIAHOME)/.git/modules/deps/utf8proc ] && echo exists ))
$(UTF8PROC_SRC_TARGET): $(JULIAHOME)/.git/modules/deps/utf8proc/HEAD
endif
$(MOJIBAKE_SRC_TARGET): libmojibake/Makefile
$(MAKE) -C libmojibake cc="$(CC) -O2 -std=c99 $(fPIC) -DMOJIBAKE_EXPORTS" AR="$(AR)" libmojibake.a
$(UTF8PROC_SRC_TARGET): utf8proc/Makefile
$(MAKE) -C utf8proc cc="$(CC) -O2 -std=c99 $(fPIC) -DUTF8PROC_EXPORTS" AR="$(AR)" libutf8proc.a
touch -c $@
libmojibake/checked: $(MOJIBAKE_SRC_TARGET)
utf8proc/checked: $(UTF8PROC_SRC_TARGET)
ifeq ($(OS),$(BUILD_OS))
-$(MAKE) -C libmojibake check
-$(MAKE) -C utf8proc check
endif
echo 1 > $@

$(MOJIBAKE_OBJ_LIB): $(MOJIBAKE_SRC_TARGET)
$(UTF8PROC_OBJ_LIB): $(UTF8PROC_SRC_TARGET)
cp -f $< $@

$(MOJIBAKE_OBJ_HEADER): libmojibake/Makefile
cp -f libmojibake/mojibake.h $@
$(UTF8PROC_OBJ_HEADER): utf8proc/Makefile
cp -f utf8proc/utf8proc.h $@

clean-mojibake:
-$(MAKE) -C libmojibake clean
-rm -rf $(build_libdir)/libmojibake.a $(build_includedir)/mojibake.h
distclean-mojibake: clean-mojibake
clean-utf8proc:
-$(MAKE) -C utf8proc clean
-rm -rf $(build_libdir)/libutf8proc.a $(build_includedir)/utf8proc.h
distclean-utf8proc: clean-utf8proc

get-mojibake: libmojibake/Makefile
configure-mojibake: get-mojibake
compile-mojibake: $(MOJIBAKE_SRC_TARGET)
check-mojibake: libmojibake/checked
install-mojibake: $(MOJIBAKE_OBJ_TARGET)
get-utf8proc: utf8proc/Makefile
configure-utf8proc: get-utf8proc
compile-utf8proc: $(UTF8PROC_SRC_TARGET)
check-utf8proc: utf8proc/checked
install-utf8proc: $(UTF8PROC_OBJ_TARGET)

## SUITESPARSE ##

Expand Down
1 change: 0 additions & 1 deletion deps/libmojibake
Submodule libmojibake deleted from 86447a
1 change: 1 addition & 0 deletions deps/utf8proc
Submodule utf8proc added at e1fdad
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ ifeq ($(USE_LLVM_SHLIB),1)
LLVMLINK = $(call exec,$(LLVM_CONFIG) --ldflags) -lLLVM-$(call exec,$(LLVM_CONFIG) --version)
endif

COMMON_LIBS = -L$(build_shlibdir) -L$(build_libdir) $(LIBUV) $(LIBMOJIBAKE) $(NO_WHOLE_ARCHIVE) $(LLVMLINK) $(OSLIBS)
COMMON_LIBS = -L$(build_shlibdir) -L$(build_libdir) $(LIBUV) $(LIBUTF8PROC) $(NO_WHOLE_ARCHIVE) $(LLVMLINK) $(OSLIBS)
DEBUG_LIBS = $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp-debug.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport-debug.a $(COMMON_LIBS)
RELEASE_LIBS = $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport.a $(COMMON_LIBS)

Expand Down
4 changes: 2 additions & 2 deletions src/flisp/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ HEADERS = $(wildcard *.h) $(LIBUV_INC)/uv.h
OBJS = $(SRCS:%.c=$(BUILDDIR)/%.o)
DOBJS = $(SRCS:%.c=$(BUILDDIR)/%.dbg.obj)
LLTDIR = ../support
LLT = $(BUILDDIR)/$(LLTDIR)/libsupport.a $(LIBUV) $(LIBMOJIBAKE)
LLT = $(BUILDDIR)/$(LLTDIR)/libsupport.a $(LIBUV) $(LIBUTF8PROC)

FLAGS = -I$(LLTDIR) $(CFLAGS) $(HFILEDIRS:%=-I%) \
-I$(LIBUV_INC) -I$(build_includedir) $(LIBDIRS:%=-L%) \
-DLIBRARY_EXPORTS -DMOJIBAKE_EXPORTS
-DLIBRARY_EXPORTS -DUTF8PROC_EXPORTS
ifneq ($(USEMSVC), 1)
FLAGS += -Wall -Wno-strict-aliasing -DUSE_COMPUTED_GOTO -fvisibility=hidden
endif
Expand Down
10 changes: 5 additions & 5 deletions src/flisp/Windows.mk
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,17 @@ OBJECTS = \
dirname.obj

LIBUV = $(MAKEDIR)\..\..\deps\libuv\libuv.lib
LIBMOJIBAKE = $(MAKEDIR)\..\..\deps\libmojibake\libmojibake.lib
LIBUTF8PROC = $(MAKEDIR)\..\..\deps\libutf8proc\libutf8proc.lib
LIBSUPPORT = $(MAKEDIR)\..\support\libsupport.lib

INCLUDE = $(INCLUDE);$(MAKEDIR)\..\..\deps\libuv\include;$(MAKEDIR)\..\..\deps\libmojibake;$(MAKEDIR)\..\support
INCLUDE = $(INCLUDE);$(MAKEDIR)\..\..\deps\libuv\include;$(MAKEDIR)\..\..\deps\libutf8proc;$(MAKEDIR)\..\support

CFLAGS = $(CFLAGS) /Qstd=c99 -D_CRT_SECURE_NO_WARNINGS -DLIBRARY_EXPORTS
LFLAGS = $(LFLAGS) kernel32.lib ws2_32.lib psapi.lib advapi32.lib iphlpapi.lib

default: $(NAME).exe

$(NAME).exe: lib$(NAME).lib flmain.obj $(LIBSUPPORT) $(LIBUV) $(LIBMOJIBAKE)
$(NAME).exe: lib$(NAME).lib flmain.obj $(LIBSUPPORT) $(LIBUV) $(LIBUTF8PROC)
$(LINK) $(LFLAGS) /OUT:$(NAME).exe /PDB:$(NAME).pdb /MAP $**

$(LIBSUPPORT):
Expand All @@ -48,8 +48,8 @@ $(LIBSUPPORT):
$(LIBUV):
PUSHD $(MAKEDIR)\..\..\deps\libuv && $(MAKE) /NOLOGO /F Windows.mk && POPD

$(LIBMOJIBAKE):
PUSHD $(MAKEDIR)\..\..\deps\libmojibake && cl -nologo /c utf8proc.c && $(AR) /OUT:libmojibake.lib utf8proc.obj && POPD
$(LIBUTF8PROC):
PUSHD $(MAKEDIR)\..\..\deps\libutf8proc && cl -nologo /c utf8proc.c && $(AR) /OUT:libutf8proc.lib utf8proc.obj && POPD

lib$(NAME).lib: $(OBJECTS)
$(AR) /OUT:lib$(NAME).lib $**
Expand Down
5 changes: 4 additions & 1 deletion src/flisp/julia_extensions.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "utf8proc.h"
#undef DLLEXPORT /* avoid conflicting definition */

#include "flisp.h"
#include "mojibake.h"

#ifdef __cplusplus
extern "C" {
Expand Down
12 changes: 5 additions & 7 deletions src/flisp/string.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,14 @@
#include <errno.h>

#include "flisp.h"

#if !defined(_OS_WINDOWS_)
#include <sys/time.h>
#endif /* !_OS_WINDOWS_ */

#undef DLLEXPORT /* avoid conflicting definition */
#include "utf8proc.h"

#ifdef __cplusplus
extern "C" {
#endif
Expand Down Expand Up @@ -53,19 +57,13 @@ value_t fl_string_count(value_t *args, u_int32_t nargs)
return size_wrap(u8_charnum(str+start, stop-start));
}

#if defined(_OS_WINDOWS_)
extern int wcwidth(uint32_t c);
#elif defined(_OS_LINUX_)
extern int wcwidth(wchar_t c);
#endif

value_t fl_string_width(value_t *args, u_int32_t nargs)
{
argcount("string.width", nargs, 1);
if (iscprim(args[0])) {
cprim_t *cp = (cprim_t*)ptr(args[0]);
if (cp_class(cp) == wchartype) {
int w = wcwidth(*(uint32_t*)cp_data(cp));
int w = utf8proc_charwidth(*(uint32_t*)cp_data(cp));
if (w < 0)
return FL_F;
return fixnum(w);
Expand Down
1 change: 0 additions & 1 deletion src/julia.expmap
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
rl_clear_input;
save_arg_area_loc;
u8_*;
wcwidth;
uv_*;
add_library_mapping;
utf8proc_*;
Expand Down
Loading

0 comments on commit 92538cf

Please sign in to comment.