Improve hash calculation to ensure all 128,660 cities in a list are uniquely hashable

- Grab more bytes per hash step when possible. This helps prevent collisions and improves speed (a small illustration follows this list)
- Move find_cities/1 to after spawning the workers. It can run in the background since it only takes a very short time
- Read fewer bytes in find_cities/1 (2MB, to be precise)
- Inline do_process_line/2 and add_to_state/3
- Do not generate home-grown, half-baked city names in the PropEr test; instead, use an input file taken from: https://public.opendatasoft.com/explore/dataset/geonames-all-cities-with-a-population-1000
- Save the test files generated by PropEr so they can be replayed from a shell for debugging rather than being lost upon shrinking
- Do not shrink at all, as it never really leads to anything anyway
- Add an eunit test which hashes all 128,660 cities and ensures uniqueness
- Add a build badge to the README
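
As a rough illustration of the first bullet, the sketch below hashes a city name in chunks of up to three bytes per step. This is not code from the commit: the module name hash_sketch and hash_name/1 are made up, and the PRIME/INIT/MASK values mirror the ones defined in src/hash.hrl further down.

```erlang
-module(hash_sketch).
-export([hash_name/1]).

%% Same constants and hash step as src/hash.hrl in this commit.
-define(PRIME, 16777619).
-define(INIT, 2166136261).
-define(MASK, 16#FFFFFFFF).
-define(HASH(Acc, Chunk), (((Acc bxor Chunk) * ?PRIME + 1)) band ?MASK).

%% Fold over the name three bytes at a time where possible, falling back
%% to two bytes and then one byte near the end of the binary.
hash_name(Bin) ->
    hash_name(Bin, ?INIT).

hash_name(<<C:24, Rest/binary>>, Acc) -> hash_name(Rest, ?HASH(Acc, C));
hash_name(<<C:16, Rest/binary>>, Acc) -> hash_name(Rest, ?HASH(Acc, C));
hash_name(<<C:8, Rest/binary>>, Acc)  -> hash_name(Rest, ?HASH(Acc, C));
hash_name(<<>>, Acc)                  -> Acc.
```

Consuming up to three bytes per ?HASH application means fewer multiply/xor steps per city name, and each step mixes in more input bits at once.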
onno-vos-dev committed Feb 4, 2024
1 parent e5b023d commit 7efcb7f
Showing 6 changed files with 128,754 additions and 86 deletions.
3 changes: 3 additions & 0 deletions README.md
@@ -1,3 +1,5 @@
[![Build](https://github.com/onno-vos-dev/1brc/actions/workflows/build.yml/badge.svg?branch=main)](https://github.com/onno-vos-dev/1brc/actions/workflows/build.yml)

# 1 Billion Row Challenge

Erlang implementation of the [One Billion Row Challenge](https://github.com/gunnarmorling/1brc)
@@ -7,6 +9,7 @@ This is included in the set of Elixir/Erlang implementations discussed here: gun
### Usage
##### Running PropEr test:
```shell
rebar3 as test eunit
rebar3 as test ct
```

71 changes: 42 additions & 29 deletions src/brc.erl
@@ -1,14 +1,14 @@
-module(brc).

-export([run/1, find_cities/1]).
-export([run/1, find_cities/2]).

-include("hash.hrl").

run([File]) ->
LkupTable = find_cities(atom_to_list(File)),
Workers = brc_workers:spawn_workers(erlang:system_info(logical_processors)),
{Pid, Ref} = erlang:spawn_monitor(fun() -> exit({normal, brc_processor:start(Workers)}) end),
brc_reader:start(File, Pid),
LkupTable = find_cities(atom_to_list(File), 1024*1024*2), %% 2MB should be enough :-)
receive
{'DOWN', Ref, process, Pid, {normal, Result}} ->
[ exit(P, kill) || P <- Workers ],
@@ -24,41 +24,54 @@ format_output(Map, LkupTable) ->
round_back(IntFloat) ->
list_to_float(io_lib:format("~.1f", [IntFloat / 10])).

find_cities(File) ->
find_cities(File, Size) ->
{ok, FD} = prim_file:open(File, [read, binary, raw, read_ahead]),
{ok, Data} = prim_file:read(FD, 1024*1024*20), %% Read twenty megabytes
CityMeasurements = binary:split(Data, <<"\n">>, [global]),
Cities = lists:usort(match_cities(CityMeasurements, [])),
LkupTable = lists:foldl(fun(City, A) -> A#{storage_key(City) => City} end, #{}, Cities),
case {length(Cities), maps:size(LkupTable)} of
{CL, ML} when CL =:= ML ->
ok;
{CL, ML} ->
throw({{num_cities, CL}, {num_lkuptable, ML}})
end,
LkupTable.
{ok, Data} = prim_file:read(FD, Size),
create_lookup_table(Data, #{}).

match_cities([], Acc) -> Acc;
match_cities([CityMeasurement | Others], Acc) ->
case binary:match(CityMeasurement, <<";">>) of
nomatch -> Acc;
{Pos, _Len} ->
match_cities(Others, [binary:part(CityMeasurement, 0, Pos) | Acc])
end.
create_lookup_table(Bin, State) ->
create_lookup_table(Bin, State, {<<>>, ?INIT}).

storage_key(Bin) ->
storage_key(Bin, ?INIT).
create_lookup_table(<<C:8, $;:8, Rest/binary>> = Bin, State, {Raw, Acc}) ->
Str = binary:part(Bin, 0, 1),
do_create_lookup_table(Rest, State, {<<Raw/binary, Str/binary>>, ?HASH(Acc, C)});
create_lookup_table(<<C:16, $;:8, Rest/binary>> = Bin, State, {Raw, Acc}) ->
Str = binary:part(Bin, 0, 2),
do_create_lookup_table(Rest, State, {<<Raw/binary, Str/binary>>, ?HASH(Acc, C)});
create_lookup_table(<<C:24, $;:8, Rest/binary>> = Bin, State, {Raw, Acc}) ->
Str = binary:part(Bin, 0, 3),
do_create_lookup_table(Rest, State, {<<Raw/binary, Str/binary>>, ?HASH(Acc, C)});
create_lookup_table(<<C:24, Rest/binary>> = Bin, State, {Raw, Acc}) ->
Str = binary:part(Bin, 0, 3),
create_lookup_table(Rest, State, {<<Raw/binary, Str/binary>>, ?HASH(Acc, C)});
create_lookup_table(_, State, _) ->
State.

storage_key(<<>>, Hash) ->
Hash;
storage_key(<<C:8, T/binary>>, Hash) ->
storage_key(T, ?HASH(Hash, C)).
do_create_lookup_table(<<_:40, $\n:8, Rest/binary>>, State, {CityRaw, City}) ->
create_lookup_table(Rest, State#{City => CityRaw}, {<<>>, ?INIT});
do_create_lookup_table(<<_:32, $\n:8, Rest/binary>>, State, {CityRaw, City}) ->
create_lookup_table(Rest, State#{City => CityRaw}, {<<>>, ?INIT});
do_create_lookup_table(<<_:24, $\n:8, Rest/binary>>, State, {CityRaw, City}) ->
create_lookup_table(Rest, State#{City => CityRaw}, {<<>>, ?INIT});
do_create_lookup_table(_, State, {CityRaw, City}) ->
State#{City => CityRaw}.

-ifdef(EUNIT).

-include_lib("eunit/include/eunit.hrl").

special_strings_hash_test() ->
?assertNotEqual(storage_key(<<"JiÔk">>), storage_key(<<"næðl">>)).
all_cities_hasheable_test() ->
{CitiesL, Map} = create_lookup_table_from_all_cities(),
io:format("Cities1: ~p~n", [hd(lists:sort(CitiesL))]),
io:format("Map1: ~p~n", [hd(lists:sort(maps:values(Map)))]),
Keys = maps:keys(Map),
io:format("Smallest hash: ~p Biggest hash: ~p~n", [lists:min(Keys), lists:max(Keys)]),
io:format("Missing: ~p~n", [CitiesL -- maps:values(Map)]),
?assertEqual(length(CitiesL), maps:size(Map)).

create_lookup_table_from_all_cities() ->
{ok, CitiesBin} = file:read_file(code:lib_dir(brc) ++ "/test/" ++ "cities_with_population_1000.txt"),
CitiesL = binary:split(CitiesBin, <<"\n">>, [global]),
{CitiesL, lists:foldl(fun(City, Acc) -> create_lookup_table(<<City/binary, ";">>, Acc, {<<>>, ?INIT}) end, #{}, CitiesL)}.

-endif.
24 changes: 12 additions & 12 deletions src/brc_workers.erl
@@ -3,6 +3,7 @@
-export([spawn_workers/1]).

-include("hash.hrl").
-compile({inline, [do_process_line/2, add_to_state/3]}).

spawn_workers(N) ->
Options = [link,
@@ -31,17 +32,16 @@ worker() ->
Parent ! {self(), worker_done, maps:from_list(get())}
end.

process_lines(<<>>, _) -> ok;
process_lines(<<$;:8, Rest/binary>>, City) ->
do_process_line(Rest, City);
process_lines(<<C1:8, $;:8, Rest/binary>>, Acc) ->
do_process_line(Rest, ?HASH(Acc, C1));
process_lines(<<C1:8, C2:8, $;:8, Rest/binary>>, Acc) ->
do_process_line(Rest, ?HASH(?HASH(Acc, C1), C2));
process_lines(<<C1:8, C2:8, C3:8, $;:8, Rest/binary>>, Acc) ->
do_process_line(Rest, ?HASH(?HASH(?HASH(Acc, C1), C2), C3));
process_lines(<<C1:8, C2:8, C3:8, C4:8, Rest/binary>>, Acc) ->
process_lines(Rest, ?HASH(?HASH(?HASH(?HASH(Acc, C1), C2), C3), C4)).
process_lines(<<>>, _) ->
ok;
process_lines(<<C:8, $;:8, Rest/binary>>, Acc) ->
do_process_line(Rest, ?HASH(Acc, C));
process_lines(<<C:16, $;:8, Rest/binary>>, Acc) ->
do_process_line(Rest, ?HASH(Acc, C));
process_lines(<<C:24, $;:8, Rest/binary>>, Acc) ->
do_process_line(Rest, ?HASH(Acc, C));
process_lines(<<C:24, Rest/binary>>, Acc) ->
process_lines(Rest, ?HASH(Acc, C)).

%% Very specialized float-parser for floats with a single fractional
%% digit, and returns the result as an integer * 10.
@@ -63,4 +63,4 @@ add_to_state(<<Rest/binary>>, City, Measurement) ->
{Min, Max, MeasurementAcc, N} ->
put(City, {min(Min, Measurement), max(Max, Measurement), MeasurementAcc + Measurement, N + 1}),
process_lines(Rest, ?INIT)
end.
end.
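
The "very specialized float-parser" mentioned in the comment above is collapsed in this view. A minimal sketch of the idea (module and function names are hypothetical, not the commit's actual implementation, and the real code also keeps scanning the rest of the line):

```erlang
-module(parse_temp_sketch).
-export([parse_temp/1]).

%% Hypothetical sketch: parse a temperature with exactly one fractional
%% digit, e.g. <<"-12.3">>, and return it as an integer scaled by 10 (-123).
parse_temp(<<$-, Rest/binary>>) ->
    -parse_temp(Rest);
parse_temp(<<D:8, $., F:8>>) ->
    (D - $0) * 10 + (F - $0);
parse_temp(<<D1:8, D2:8, $., F:8>>) ->
    ((D1 - $0) * 10 + (D2 - $0)) * 10 + (F - $0).
```

Keeping the value as an integer * 10 avoids float arithmetic in the hot loop; round_back/1 in brc.erl converts it back to a one-decimal float only when formatting the output.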
12 changes: 8 additions & 4 deletions src/hash.hrl
@@ -1,16 +1,20 @@
-ifndef(_HASH_HRL_).
-define(_HASH_HRL_, true).


%-define(HASH(Acc, Char), (((Acc band (Char + ?MASK)) + (Char + ?PRIME)) + (((Acc + Char) bxor Char) * (?PRIME band Char) band ?MASK))).

%% This is a modified version of the FNV64a hashing code found at:
%% https://github.com/leostera/erlang-hash/blob/a1b9101189e115b4eabbe941639f3c626614e986/src/hash_fnv.erl#L98
%%
%% The reason is that PropEr kept finding conflicting keys such as for example: <<"JiÔk">> & <<"næðl">>
%% which both produce the same FNV64a Hash.
%% The reason is that PropEr kept finding conflicting keys
%% which both produce the same FNV Hash. Hence, we test that all cities from the test file containing 128.660 cities
%% are hasheable with below hash. See brc.erl eunit test for that.
%%
%% So we'll call this the 1BRC hash :-)
%% So we'll call this just hash :-)
-define(PRIME, 16777619).
-define(INIT, 2166136261).
-define(MASK, 16#FFFFFFFF).
-define(HASH(Acc, Char), ((Char * Char + Char) + (Acc bxor Char) * ?PRIME) band ?MASK).
-define(HASH(Acc, Char), (((Acc bxor Char) * ?PRIME + 1)) band ?MASK).

-endif.
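
For a concrete picture of the keys this macro produces, here is a hypothetical illustration (not part of the commit; the module name hash_example and the sample line are made up) of the storage key a worker would compute for the line <<"Abha;-12.3\n">>: one 3-byte chunk followed by the final byte before the ';', exactly as the patterns in process_lines/2 consume it.

```erlang
-module(hash_example).
-export([key_for_abha/0]).

-include("hash.hrl").

key_for_abha() ->
    <<Chunk:24, C:8, $;, _Rest/binary>> = <<"Abha;-12.3\n">>,
    H1 = ?HASH(?INIT, Chunk),   %% fold in <<"Abh">> as a single 24-bit integer
    ?HASH(H1, C).               %% fold in $a, yielding the key used in the map
```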
