Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

update utf8proc, replace wcwidth #10659

Merged
merged 1 commit into from
Mar 30, 2015
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 3 additions & 3 deletions .gitmodules
Original file line number Diff line number Diff line change
Expand Up @@ -7,6 +7,6 @@
[submodule "deps/openspecfun"]
path = deps/openspecfun
url = git://github.com/JuliaLang/openspecfun.git
[submodule "deps/libmojibake"]
path = deps/libmojibake
url = git://github.com/JuliaLang/libmojibake.git
[submodule "deps/utf8proc"]
path = deps/utf8proc
url = git://github.com/JuliaLang/utf8proc.git
14 changes: 6 additions & 8 deletions Make.inc
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,7 @@ USE_SYSTEM_SUITESPARSE=0
USE_SYSTEM_RMATH=0
USE_SYSTEM_LIBUV=0
USE_SYSTEM_UTF8PROC=0
USE_SYSTEM_MOJIBAKE=0
USE_SYSTEM_UTF8PROC=0
USE_SYSTEM_LIBGIT2=0

# Link to the LLVM shared library
Expand Down Expand Up @@ -594,14 +594,12 @@ else
LIBUV_INC = $(JULIAHOME)/deps/libuv/include
endif

ifeq ($(USE_SYSTEM_MOJIBAKE), 1)
LIBMOJIBAKE = -lmojibake
ifeq ($(USE_SYSTEM_UTF8PROC), 1)
LIBUTF8PROC = -lutf8proc
UTF8PROC_INC = /usr/include
else
ifeq ($(USE_SYSTEM_UTF8PROC), 1)
LIBMOJIBAKE = -lutf8proc
else
LIBMOJIBAKE = $(build_libdir)/libmojibake.a
endif
LIBUTF8PROC = $(build_libdir)/libutf8proc.a
UTF8PROC_INC = $(JULIAHOME)/deps/utf8proc
endif

# OS specific stuff
Expand Down
26 changes: 17 additions & 9 deletions NEWS.md
Original file line number Diff line number Diff line change
Expand Up @@ -144,6 +144,21 @@ Library improvements

* Add sparse least squares to ``\`` by adding ``qrfact`` for sparse matrices based on the SPQR library. ([#10180])

* String improvements

* `graphemes(s)` returns an iterator over grapheme substrings of `s` ([#9261]).

* Character predicates such as `islower()`, `isspace()`, etc. use
utf8proc to provide uniform cross-platform behavior and
up-to-date, locale-independent support for Unicode standards
([#5939]).

* `reverseind` function to convert indices in reversed strings (e.g. from
reversed regex searches) to indices in the original string ([#9249]).

* `charwidth(c)` and `strwidth(s)` now return up-to-date cross-platform
results (via utf8proc) ([#10659]): Julia now likes pizza ([#3721]).

* Other improvements

* `gc_enable` and `gc_disable` return the previous GC state.
Expand Down Expand Up @@ -178,15 +193,6 @@ Library improvements

* Efficient `mean` and `median` for ranges ([#8089]).

* `graphemes(s)` returns an iterator over grapheme substrings of `s` ([#9261]).

* Character predicates such as `islower()`, `isspace()`, etc. use utf8proc/libmojibake
to provide uniform cross-platform behavior and up-to-date, locale-independent support
for Unicode standards ([#5939]).

* `reverseind` function to convert indices in reversed strings (e.g. from
reversed regex searches) to indices in the original string ([#9249]).

* New `Nullable` type for missing data ([#8152]).

* `deepcopy` recurses through immutable types and makes copies of their mutable fields ([#8560]).
Expand Down Expand Up @@ -1084,6 +1090,7 @@ Too numerous to mention.
[#3688]: https://github.com/JuliaLang/julia/issues/3688
[#3697]: https://github.com/JuliaLang/julia/issues/3697
[#3719]: https://github.com/JuliaLang/julia/issues/3719
[#3721]: https://github.com/JuliaLang/julia/issues/3721
[#3737]: https://github.com/JuliaLang/julia/issues/3737
[#3759]: https://github.com/JuliaLang/julia/issues/3759
[#3790]: https://github.com/JuliaLang/julia/issues/3790
Expand Down Expand Up @@ -1313,3 +1320,4 @@ Too numerous to mention.
[#10446]: https://github.com/JuliaLang/julia/issues/10446
[#10458]: https://github.com/JuliaLang/julia/issues/10458
[#10543]: https://github.com/JuliaLang/julia/issues/10543
[#10659]: https://github.com/JuliaLang/julia/issues/10659
1 change: 0 additions & 1 deletion base/string.jl
Original file line number Diff line number Diff line change
Expand Up @@ -541,7 +541,6 @@ startswith(a::Array{UInt8,1}, b::Array{UInt8,1}) =

## character column width function ##

charwidth(c::Char) = max(0,Int(ccall(:wcwidth, Int32, (UInt32,), c)))
strwidth(s::AbstractString) = (w=0; for c in s; w += charwidth(c); end; w)
strwidth(s::ByteString) = Int(ccall(:u8_strwidth, Csize_t, (Ptr{UInt8},), s.data))
# TODO: implement and use u8_strnwidth that takes a length argument
Expand Down
17 changes: 10 additions & 7 deletions base/utf8proc.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,13 +6,13 @@ import Base: show, showcompact, ==, hash, string, symbol, isless, length, eltype
export isgraphemebreak

# also exported by Base:
export normalize_string, graphemes, is_valid_char, is_assigned_char,
export normalize_string, graphemes, is_valid_char, is_assigned_char, charwidth,
islower, isupper, isalpha, isdigit, isnumber, isalnum,
iscntrl, ispunct, isspace, isprint, isgraph, isblank

# whether codepoints are valid Unicode
is_valid_char(c) = (0x0 <= c <= 0x110000) && Bool(ccall(:utf8proc_codepoint_valid, Cuchar, (Int32,), c))
is_valid_char(c::Char) = is_valid_char(UInt32(c))
is_valid_char(c::Union(UInt8,UInt16,UInt32,Char)) = Bool(ccall(:utf8proc_codepoint_valid, Cuchar, (UInt32,), c))
is_valid_char(c::Integer) = (0x0 <= c <= 0x110000) && is_valid_char(UInt32(c))

# utf8 category constants
const UTF8PROC_CATEGORY_CN = 0
Expand Down Expand Up @@ -116,10 +116,13 @@ end

############################################################################

# returns UTF8PROC_CATEGORY code in 1:30 giving Unicode category
charwidth(c::Char) = Int(ccall(:utf8proc_charwidth, Cint, (UInt32,), c))

############################################################################

# returns UTF8PROC_CATEGORY code in 0:30 giving Unicode category
function category_code(c)
UInt32(c) > 0x10FFFF && return 0x0000 # see utf8proc_get_property docs
return unsafe_load(ccall(:utf8proc_get_property, Ptr{UInt16}, (Int32,), c))
return ccall(:utf8proc_category, Cint, (UInt32,), c)
end

is_assigned_char(c) = category_code(c) != UTF8PROC_CATEGORY_CN
Expand Down Expand Up @@ -176,7 +179,7 @@ end
# iterators for grapheme segmentation

isgraphemebreak(c1::Char, c2::Char) =
ccall(:utf8proc_grapheme_break, Bool, (Char, Char), c1, c2)
ccall(:utf8proc_grapheme_break, Bool, (UInt32, UInt32), c1, c2)

immutable GraphemeIterator{S<:AbstractString}
s::S # original string (for generation of SubStrings)
Expand Down
8 changes: 4 additions & 4 deletions contrib/windows/msys_build.sh
Original file line number Diff line number Diff line change
Expand Up @@ -180,9 +180,9 @@ echo 'override LIBLAPACKNAME = $(LIBBLASNAME)' >> Make.user
# Remaining dependencies:
# libuv since its static lib is no longer included in the binaries
# openlibm since we need it as a static library to work properly
# mojibake since its headers are not in the binary download
# utf8proc since its headers are not in the binary download
echo 'override STAGE1_DEPS = uv' >> Make.user
echo 'override STAGE2_DEPS = mojibake' >> Make.user
echo 'override STAGE2_DEPS = utf8proc' >> Make.user
echo 'override STAGE3_DEPS = ' >> Make.user
make -C deps get-uv

Expand All @@ -197,8 +197,8 @@ if [ -n "$USEMSVC" ]; then
# Since we don't have a static library for openlibm
echo 'override UNTRUSTED_SYSTEM_LIBM = 0' >> Make.user

# Compile libuv and mojibake without -TP first, then add -TP
make -C deps install-uv install-mojibake
# Compile libuv and utf8proc without -TP first, then add -TP
make -C deps install-uv install-utf8proc
cp usr/lib/uv.lib usr/lib/libuv.a
echo 'override CC += -TP' >> Make.user
else
Expand Down
62 changes: 31 additions & 31 deletions deps/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -11,10 +11,10 @@ include $(JULIAHOME)/Make.inc
# if you are adding a new target, it can help to copy a similar, existing target
#
# autoconf configure-driven scripts: llvm pcre arpack fftw unwind gmp mpfr patchelf uv
# custom Makefile rules: openlibm Rmath-julia dsfmt suitesparse-wrapper suitesparse lapack openblas mojibake objconv
# custom Makefile rules: openlibm Rmath-julia dsfmt suitesparse-wrapper suitesparse lapack openblas utf8proc objconv
# CMake libs: libgit2
#
# downloaded from git: llvm-svn, uv, libopenlibm, mojibake, openspecfun
# downloaded from git: llvm-svn, uv, libopenlibm, utf8proc, openspecfun
#
# there are rules in this file with the . replaced by a %
# this is some magic Makefile trick that tells make
Expand Down Expand Up @@ -150,8 +150,8 @@ ifeq ($(USE_SYSTEM_SUITESPARSE), 0)
STAGE2_DEPS += suitesparse
endif

ifeq ($(USE_SYSTEM_MOJIBAKE), 0)
STAGE2_DEPS += mojibake
ifeq ($(USE_SYSTEM_UTF8PROC), 0)
STAGE2_DEPS += utf8proc
endif

# Only compile standalone LAPACK if we are not using OpenBLAS.
Expand Down Expand Up @@ -183,7 +183,7 @@ install: $(addprefix install-, $(DEP_LIBS))
cleanall: $(addprefix clean-, $(DEP_LIBS))
distcleanall: $(addprefix distclean-, $(DEP_LIBS))
rm -rf $(build_prefix)
getall: get-llvm get-uv get-pcre get-openlibm get-openspecfun get-dsfmt get-Rmath-julia get-openblas get-lapack get-fftw get-suitesparse get-arpack get-unwind get-osxunwind get-gmp get-mpfr get-patchelf get-mojibake get-virtualenv get-objconv get-libgit2
getall: get-llvm get-uv get-pcre get-openlibm get-openspecfun get-dsfmt get-Rmath-julia get-openblas get-lapack get-fftw get-suitesparse get-arpack get-unwind get-osxunwind get-gmp get-mpfr get-patchelf get-utf8proc get-virtualenv get-objconv get-libgit2

## PATHS ##
# sort is used to remove potential duplicates
Expand Down Expand Up @@ -1344,47 +1344,47 @@ compile-fftw-double: $(FFTW_DOUBLE_OBJ_TARGET)
check-fftw-double: fftw-$(FFTW_VER)-double/checked
install-fftw-double: $(FFTW_DOUBLE_OBJ_TARGET)

## MOJIBAKE ##
## UTF8PROC ##

MOJIBAKE_SRC_TARGET = libmojibake/libmojibake.a
MOJIBAKE_OBJ_LIB = $(build_libdir)/libmojibake.a
MOJIBAKE_OBJ_HEADER = $(build_includedir)/mojibake.h
MOJIBAKE_OBJ_TARGET = $(MOJIBAKE_OBJ_LIB) $(MOJIBAKE_OBJ_HEADER)
UTF8PROC_SRC_TARGET = utf8proc/libutf8proc.a
UTF8PROC_OBJ_LIB = $(build_libdir)/libutf8proc.a
UTF8PROC_OBJ_HEADER = $(build_includedir)/utf8proc.h
UTF8PROC_OBJ_TARGET = $(UTF8PROC_OBJ_LIB) $(UTF8PROC_OBJ_HEADER)


libmojibake/Makefile:
utf8proc/Makefile:
(cd .. && git submodule init && git submodule update)
ifeq (exists, $(shell [ -d libmojibake/.git ] && echo exists ))
$(MOJIBAKE_SRC_TARGET): libmojibake/.git/HEAD
ifeq (exists, $(shell [ -d utf8proc/.git ] && echo exists ))
$(UTF8PROC_SRC_TARGET): utf8proc/.git/HEAD
endif
ifeq (exists, $(shell [ -d $(JULIAHOME)/.git/modules/deps/libmojibake ] && echo exists ))
$(MOJIBAKE_SRC_TARGET): $(JULIAHOME)/.git/modules/deps/libmojibake/HEAD
ifeq (exists, $(shell [ -d $(JULIAHOME)/.git/modules/deps/utf8proc ] && echo exists ))
$(UTF8PROC_SRC_TARGET): $(JULIAHOME)/.git/modules/deps/utf8proc/HEAD
endif
$(MOJIBAKE_SRC_TARGET): libmojibake/Makefile
$(MAKE) -C libmojibake cc="$(CC) -O2 -std=c99 $(fPIC) -DMOJIBAKE_EXPORTS" AR="$(AR)" libmojibake.a
$(UTF8PROC_SRC_TARGET): utf8proc/Makefile
$(MAKE) -C utf8proc cc="$(CC) -O2 -std=c99 $(fPIC) -DUTF8PROC_EXPORTS" AR="$(AR)" libutf8proc.a
touch -c $@
libmojibake/checked: $(MOJIBAKE_SRC_TARGET)
utf8proc/checked: $(UTF8PROC_SRC_TARGET)
ifeq ($(OS),$(BUILD_OS))
-$(MAKE) -C libmojibake check
-$(MAKE) -C utf8proc check
endif
echo 1 > $@

$(MOJIBAKE_OBJ_LIB): $(MOJIBAKE_SRC_TARGET)
$(UTF8PROC_OBJ_LIB): $(UTF8PROC_SRC_TARGET)
cp -f $< $@

$(MOJIBAKE_OBJ_HEADER): libmojibake/Makefile
cp -f libmojibake/mojibake.h $@
$(UTF8PROC_OBJ_HEADER): utf8proc/Makefile
cp -f utf8proc/utf8proc.h $@

clean-mojibake:
-$(MAKE) -C libmojibake clean
-rm -rf $(build_libdir)/libmojibake.a $(build_includedir)/mojibake.h
distclean-mojibake: clean-mojibake
clean-utf8proc:
-$(MAKE) -C utf8proc clean
-rm -rf $(build_libdir)/libutf8proc.a $(build_includedir)/utf8proc.h
distclean-utf8proc: clean-utf8proc

get-mojibake: libmojibake/Makefile
configure-mojibake: get-mojibake
compile-mojibake: $(MOJIBAKE_SRC_TARGET)
check-mojibake: libmojibake/checked
install-mojibake: $(MOJIBAKE_OBJ_TARGET)
get-utf8proc: utf8proc/Makefile
configure-utf8proc: get-utf8proc
compile-utf8proc: $(UTF8PROC_SRC_TARGET)
check-utf8proc: utf8proc/checked
install-utf8proc: $(UTF8PROC_OBJ_TARGET)

## SUITESPARSE ##

Expand Down
1 change: 0 additions & 1 deletion deps/libmojibake
Submodule libmojibake deleted from 86447a
1 change: 1 addition & 0 deletions deps/utf8proc
Submodule utf8proc added at e1fdad
2 changes: 1 addition & 1 deletion src/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ ifeq ($(USE_LLVM_SHLIB),1)
LLVMLINK = $(call exec,$(LLVM_CONFIG) --ldflags) -lLLVM-$(call exec,$(LLVM_CONFIG) --version)
endif

COMMON_LIBS = -L$(build_shlibdir) -L$(build_libdir) $(LIBUV) $(LIBMOJIBAKE) $(NO_WHOLE_ARCHIVE) $(LLVMLINK) $(OSLIBS)
COMMON_LIBS = -L$(build_shlibdir) -L$(build_libdir) $(LIBUV) $(LIBUTF8PROC) $(NO_WHOLE_ARCHIVE) $(LLVMLINK) $(OSLIBS)
DEBUG_LIBS = $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp-debug.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport-debug.a $(COMMON_LIBS)
RELEASE_LIBS = $(WHOLE_ARCHIVE) $(BUILDDIR)/flisp/libflisp.a $(WHOLE_ARCHIVE) $(BUILDDIR)/support/libsupport.a $(COMMON_LIBS)

Expand Down
4 changes: 2 additions & 2 deletions src/flisp/Makefile
Original file line number Diff line number Diff line change
Expand Up @@ -22,11 +22,11 @@ HEADERS = $(wildcard *.h) $(LIBUV_INC)/uv.h
OBJS = $(SRCS:%.c=$(BUILDDIR)/%.o)
DOBJS = $(SRCS:%.c=$(BUILDDIR)/%.dbg.obj)
LLTDIR = ../support
LLT = $(BUILDDIR)/$(LLTDIR)/libsupport.a $(LIBUV) $(LIBMOJIBAKE)
LLT = $(BUILDDIR)/$(LLTDIR)/libsupport.a $(LIBUV) $(LIBUTF8PROC)

FLAGS = -I$(LLTDIR) $(CFLAGS) $(HFILEDIRS:%=-I%) \
-I$(LIBUV_INC) -I$(build_includedir) $(LIBDIRS:%=-L%) \
-DLIBRARY_EXPORTS -DMOJIBAKE_EXPORTS
-DLIBRARY_EXPORTS -DUTF8PROC_EXPORTS
ifneq ($(USEMSVC), 1)
FLAGS += -Wall -Wno-strict-aliasing -DUSE_COMPUTED_GOTO -fvisibility=hidden
endif
Expand Down
10 changes: 5 additions & 5 deletions src/flisp/Windows.mk
Original file line number Diff line number Diff line change
Expand Up @@ -29,17 +29,17 @@ OBJECTS = \
dirname.obj

LIBUV = $(MAKEDIR)\..\..\deps\libuv\libuv.lib
LIBMOJIBAKE = $(MAKEDIR)\..\..\deps\libmojibake\libmojibake.lib
LIBUTF8PROC = $(MAKEDIR)\..\..\deps\libutf8proc\libutf8proc.lib
LIBSUPPORT = $(MAKEDIR)\..\support\libsupport.lib

INCLUDE = $(INCLUDE);$(MAKEDIR)\..\..\deps\libuv\include;$(MAKEDIR)\..\..\deps\libmojibake;$(MAKEDIR)\..\support
INCLUDE = $(INCLUDE);$(MAKEDIR)\..\..\deps\libuv\include;$(MAKEDIR)\..\..\deps\libutf8proc;$(MAKEDIR)\..\support

CFLAGS = $(CFLAGS) /Qstd=c99 -D_CRT_SECURE_NO_WARNINGS -DLIBRARY_EXPORTS
LFLAGS = $(LFLAGS) kernel32.lib ws2_32.lib psapi.lib advapi32.lib iphlpapi.lib

default: $(NAME).exe

$(NAME).exe: lib$(NAME).lib flmain.obj $(LIBSUPPORT) $(LIBUV) $(LIBMOJIBAKE)
$(NAME).exe: lib$(NAME).lib flmain.obj $(LIBSUPPORT) $(LIBUV) $(LIBUTF8PROC)
$(LINK) $(LFLAGS) /OUT:$(NAME).exe /PDB:$(NAME).pdb /MAP $**

$(LIBSUPPORT):
Expand All @@ -48,8 +48,8 @@ $(LIBSUPPORT):
$(LIBUV):
PUSHD $(MAKEDIR)\..\..\deps\libuv && $(MAKE) /NOLOGO /F Windows.mk && POPD

$(LIBMOJIBAKE):
PUSHD $(MAKEDIR)\..\..\deps\libmojibake && cl -nologo /c utf8proc.c && $(AR) /OUT:libmojibake.lib utf8proc.obj && POPD
$(LIBUTF8PROC):
PUSHD $(MAKEDIR)\..\..\deps\libutf8proc && cl -nologo /c utf8proc.c && $(AR) /OUT:libutf8proc.lib utf8proc.obj && POPD

lib$(NAME).lib: $(OBJECTS)
$(AR) /OUT:lib$(NAME).lib $**
Expand Down
5 changes: 4 additions & 1 deletion src/flisp/julia_extensions.c
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,11 @@
#include <stdio.h>
#include <string.h>
#include <assert.h>

#include "utf8proc.h"
#undef DLLEXPORT /* avoid conflicting definition */

#include "flisp.h"
#include "mojibake.h"

#ifdef __cplusplus
extern "C" {
Expand Down
12 changes: 5 additions & 7 deletions src/flisp/string.c
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,14 @@
#include <errno.h>

#include "flisp.h"

#if !defined(_OS_WINDOWS_)
#include <sys/time.h>
#endif /* !_OS_WINDOWS_ */

#undef DLLEXPORT /* avoid conflicting definition */
#include "utf8proc.h"

#ifdef __cplusplus
extern "C" {
#endif
Expand Down Expand Up @@ -53,19 +57,13 @@ value_t fl_string_count(value_t *args, u_int32_t nargs)
return size_wrap(u8_charnum(str+start, stop-start));
}

#if defined(_OS_WINDOWS_)
extern int wcwidth(uint32_t c);
#elif defined(_OS_LINUX_)
extern int wcwidth(wchar_t c);
#endif

value_t fl_string_width(value_t *args, u_int32_t nargs)
{
argcount("string.width", nargs, 1);
if (iscprim(args[0])) {
cprim_t *cp = (cprim_t*)ptr(args[0]);
if (cp_class(cp) == wchartype) {
int w = wcwidth(*(uint32_t*)cp_data(cp));
int w = utf8proc_charwidth(*(uint32_t*)cp_data(cp));
if (w < 0)
return FL_F;
return fixnum(w);
Expand Down
1 change: 0 additions & 1 deletion src/julia.expmap
Original file line number Diff line number Diff line change
Expand Up @@ -29,7 +29,6 @@
rl_clear_input;
save_arg_area_loc;
u8_*;
wcwidth;
uv_*;
add_library_mapping;
utf8proc_*;
Expand Down
Loading