diff --git a/LICENSE.txt b/LICENSE.txt
index 12628537e39287..70a43675b2a3a0 100644
--- a/LICENSE.txt
+++ b/LICENSE.txt
@@ -599,3 +599,26 @@ be/src/util/minizip/* : Condition of use and distribution are the same than zli
 
 --------------------------------------------------------------------------------
 
+be/src/util/utf8_check.cpp: MIT license
+
+MIT License
+
+Copyright (c) 2019 Yibo Cai
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/be/CMakeLists.txt b/be/CMakeLists.txt
index 5a82572c8c8b21..29f2f82398d4c6 100644
--- a/be/CMakeLists.txt
+++ b/be/CMakeLists.txt
@@ -40,7 +40,8 @@ string(TOUPPER ${CMAKE_BUILD_TYPE} CMAKE_BUILD_TYPE)
 message(STATUS "Build type is ${CMAKE_BUILD_TYPE}")
 
 # set CMAKE_BUILD_TARGET_ARCH
-execute_process(COMMAND bash "-c" "lscpu | grep 'Architecture' | awk '{print $2}'"
+# `lscpu | grep 'Architecture' | awk '{print $2}'` only works on systems whose locale is en_US.UTF-8, so use `uname -m` instead
+execute_process(COMMAND bash "-c" "uname -m"
                 OUTPUT_VARIABLE CMAKE_BUILD_TARGET_ARCH
                 OUTPUT_STRIP_TRAILING_WHITESPACE)
diff --git a/be/src/exec/parquet_reader.cpp b/be/src/exec/parquet_reader.cpp
index 69181dd5975184..2d5518a4af7183 100644
--- a/be/src/exec/parquet_reader.cpp
+++ b/be/src/exec/parquet_reader.cpp
@@ -460,7 +460,7 @@ ParquetFile::~ParquetFile() {
 }
 
 arrow::Status ParquetFile::Close() {
-    if (_file) {
+    if (_file != nullptr) {
         _file->close();
         delete _file;
         _file = nullptr;
@@ -469,7 +469,7 @@ arrow::Status ParquetFile::Close() {
 }
 
 bool ParquetFile::closed() const {
-    if (_file) {
+    if (_file != nullptr) {
         return _file->closed();
     } else {
         return true;
@@ -477,28 +477,14 @@ bool ParquetFile::closed() const {
 }
 
 arrow::Status ParquetFile::Read(int64_t nbytes, int64_t* bytes_read, void* buffer) {
-    bool eof = false;
-    size_t data_size = 0;
-    do {
-        data_size = nbytes;
-        Status result = _file->read((uint8_t*)buffer, &data_size, &eof);
-        if (!result.ok()) {
-            return arrow::Status::IOError("Read failed.");
-        }
-        if (eof) {
-            break;
-        }
-        *bytes_read += data_size; // total read bytes
-        nbytes -= data_size; // remained bytes
-        buffer = (uint8_t*)buffer + data_size;
-    } while (nbytes != 0);
-    return arrow::Status::OK();
+    return ReadAt(_pos, nbytes, bytes_read, buffer);
 }
 
 arrow::Status ParquetFile::ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, void* out) {
     int64_t reads = 0;
-    while(nbytes != 0) {
-        Status result = _file->readat(position, nbytes, &reads, out);
+    _pos = position;
+    while (nbytes > 0) {
+        Status result = _file->readat(_pos, nbytes, &reads, out);
         if (!result.ok()) {
            *bytes_read = 0;
            return arrow::Status::IOError("Readat failed.");
@@ -508,7 +494,7 @@ arrow::Status ParquetFile::ReadAt(int64_t position, int64_t nbytes, int64_t* byt
         }
         *bytes_read += reads;// total read bytes
        nbytes -= reads; // remained bytes
-        position += reads;
+        _pos += reads;
         out = (char*)out + reads;
     }
     return arrow::Status::OK();
@@ -520,13 +506,14 @@ arrow::Status ParquetFile::GetSize(int64_t* size) {
 }
 
 arrow::Status ParquetFile::Seek(int64_t position) {
-    _file->seek(position);
+    _pos = position;
+    // NOTE: Only the readat operation is used, so _file->seek() is not called here.
    return arrow::Status::OK();
 }
 
 arrow::Status ParquetFile::Tell(int64_t* position) const {
-    _file->tell(position);
+    *position = _pos;
    return arrow::Status::OK();
 }
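The ParquetFile change above turns Read() into a thin wrapper over ReadAt() and keeps a cursor (_pos) in the wrapper itself, so Seek() and Tell() no longer touch the underlying file handle. Below is a minimal sketch of that bookkeeping, assuming only that the wrapped reader exposes a positional readat(offset, nbytes, &bytes_read, buffer) call; FakeFile and PositionedReader are illustrative stand-ins, not Doris classes.

#include <algorithm>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <string>
#include <vector>

// Stand-in for the underlying reader: positional reads only, like FileReader::readat().
struct FakeFile {
    std::vector<uint8_t> data;
    bool readat(int64_t pos, int64_t nbytes, int64_t* read, void* out) const {
        if (pos >= (int64_t)data.size()) { *read = 0; return true; }  // EOF
        *read = std::min<int64_t>(nbytes, (int64_t)data.size() - pos);
        std::memcpy(out, data.data() + pos, *read);
        return true;
    }
};

// Mirrors the patched ParquetFile: Read() is ReadAt() at the remembered position,
// while Seek()/Tell() only update or report _pos and never call into the file.
class PositionedReader {
public:
    explicit PositionedReader(FakeFile* file) : _file(file) {}

    bool ReadAt(int64_t position, int64_t nbytes, int64_t* bytes_read, void* out) {
        _pos = position;
        *bytes_read = 0;
        while (nbytes > 0) {
            int64_t reads = 0;
            if (!_file->readat(_pos, nbytes, &reads, out)) return false;
            if (reads == 0) break;            // reached EOF
            *bytes_read += reads;             // total read bytes
            nbytes -= reads;                  // remaining bytes
            _pos += reads;                    // advance the cursor, not the file
            out = (char*)out + reads;
        }
        return true;
    }
    bool Read(int64_t nbytes, int64_t* bytes_read, void* out) {
        return ReadAt(_pos, nbytes, bytes_read, out);
    }
    void Seek(int64_t position) { _pos = position; }
    int64_t Tell() const { return _pos; }

private:
    FakeFile* _file;
    int64_t _pos = 0;
};

int main() {
    FakeFile f{std::vector<uint8_t>{'p', 'a', 'r', 'q', 'u', 'e', 't'}};
    PositionedReader r(&f);
    char buf[4] = {};
    int64_t n = 0;
    r.Seek(3);
    r.Read(4, &n, buf);  // reads "quet" and advances the cursor to 7
    std::cout << std::string(buf, n) << " pos=" << r.Tell() << "\n";
    return 0;
}

Because the backend is purely positional, Seek() becomes free: it only records the offset that the next Read() will hand to ReadAt().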
diff --git a/be/src/exprs/bitmap_function.cpp b/be/src/exprs/bitmap_function.cpp
index 52231b92f87084..9d6b479933a8ff 100644
--- a/be/src/exprs/bitmap_function.cpp
+++ b/be/src/exprs/bitmap_function.cpp
@@ -388,6 +388,98 @@ BigIntVal BitmapFunctions::bitmap_intersect_finalize(FunctionContext* ctx, const
     return result;
 }
 
+StringVal BitmapFunctions::bitmap_or(FunctionContext* ctx, const StringVal& src, const StringVal& dst) {
+    RoaringBitmap bitmap;
+    if (!src.is_null) {
+        if (src.len == 0) {
+            bitmap.merge(*reinterpret_cast<RoaringBitmap*>(src.ptr));
+        } else {
+            bitmap.merge(RoaringBitmap((char*)src.ptr));
+        }
+    }
+
+    if (!dst.is_null) {
+        if (dst.len == 0) {
+            bitmap.merge(*reinterpret_cast<RoaringBitmap*>(dst.ptr));
+        } else {
+            bitmap.merge(RoaringBitmap((char*)dst.ptr));
+        }
+    }
+
+    StringVal result(ctx, bitmap.size());
+    bitmap.serialize((char*)result.ptr);
+    return result;
+}
+
+StringVal BitmapFunctions::bitmap_and(FunctionContext* ctx, const StringVal& src, const StringVal& dst) {
+    RoaringBitmap bitmap;
+    if (!src.is_null) {
+        if (src.len == 0) {
+            bitmap.merge(*reinterpret_cast<RoaringBitmap*>(src.ptr));
+        } else {
+            bitmap.merge(RoaringBitmap((char*)src.ptr));
+        }
+    }
+
+    if (!dst.is_null) {
+        if (dst.len == 0) {
+            bitmap.intersect(*reinterpret_cast<RoaringBitmap*>(dst.ptr));
+        } else {
+            bitmap.intersect(RoaringBitmap((char*)dst.ptr));
+        }
+    }
+
+    StringVal result(ctx, bitmap.size());
+    bitmap.serialize((char*)result.ptr);
+    return result;
+}
+
 template void BitmapFunctions::bitmap_update_int(
     FunctionContext* ctx, const TinyIntVal& src, StringVal* dst);
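The two new functions differ only in how the second argument is folded in: bitmap_or merges (set union) while bitmap_and intersects, and the len == 0 branch appears to treat the StringVal as a pointer to an in-memory RoaringBitmap (the aggregation intermediate) rather than a serialized buffer. A toy sketch of the union/intersection semantics is shown below, mirroring the merge()/intersect() calls in the patch; ToyBitmap is an illustrative std::set stand-in, since RoaringBitmap itself is internal to the BE.

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <iterator>
#include <set>

// Toy stand-in for RoaringBitmap, just enough to show the two operations:
// bitmap_or merges (union), bitmap_and intersects.
struct ToyBitmap {
    std::set<uint32_t> values;
    void merge(const ToyBitmap& other) {       // analogous to RoaringBitmap::merge
        values.insert(other.values.begin(), other.values.end());
    }
    void intersect(const ToyBitmap& other) {   // analogous to RoaringBitmap::intersect
        std::set<uint32_t> out;
        std::set_intersection(values.begin(), values.end(),
                              other.values.begin(), other.values.end(),
                              std::inserter(out, out.begin()));
        values.swap(out);
    }
    size_t cardinality() const { return values.size(); }
};

int main() {
    ToyBitmap a{{1, 2, 3}};
    ToyBitmap b{{2, 3, 4}};

    ToyBitmap or_result = a;
    or_result.merge(b);        // {1, 2, 3, 4}

    ToyBitmap and_result = a;
    and_result.intersect(b);   // {2, 3}

    std::cout << "or cardinality: " << or_result.cardinality() << "\n"    // prints 4
              << "and cardinality: " << and_result.cardinality() << "\n"; // prints 2
    return 0;
}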
diff --git a/be/src/exprs/bitmap_function.h b/be/src/exprs/bitmap_function.h
index 9c313842d216c9..4ea8216b4dc75e 100644
--- a/be/src/exprs/bitmap_function.h
+++ b/be/src/exprs/bitmap_function.h
@@ -41,6 +41,8 @@ class BitmapFunctions {
     static StringVal to_bitmap(FunctionContext* ctx, const StringVal& src);
     static StringVal bitmap_hash(FunctionContext* ctx, const StringVal& src);
 
+    static StringVal bitmap_or(FunctionContext* ctx, const StringVal& src, const StringVal& dst);
+    static StringVal bitmap_and(FunctionContext* ctx, const StringVal& src, const StringVal& dst);
     // bitmap_intersect
     template
diff --git a/be/src/exprs/hll_function.cpp b/be/src/exprs/hll_function.cpp
index 7731c60a4f4e23..e18f1774297473 100644
--- a/be/src/exprs/hll_function.cpp
+++ b/be/src/exprs/hll_function.cpp
@@ -80,6 +80,12 @@ BigIntVal HllFunctions::hll_finalize(FunctionContext*, const StringVal &src) {
     return result;
 }
 
+BigIntVal HllFunctions::hll_get_value(FunctionContext*, const StringVal &src) {
+    auto* src_hll = reinterpret_cast<HyperLogLog*>(src.ptr);
+    BigIntVal result(src_hll->estimate_cardinality());
+    return result;
+}
+
 BigIntVal HllFunctions::hll_cardinality(FunctionContext* ctx, const StringVal& input) {
     if (input.is_null) {
         return BigIntVal::null();
diff --git a/be/src/exprs/hll_function.h b/be/src/exprs/hll_function.h
index 9b2553176291c3..e757204437a067 100644
--- a/be/src/exprs/hll_function.h
+++ b/be/src/exprs/hll_function.h
@@ -36,6 +36,10 @@ class HllFunctions {
 
     static BigIntVal hll_finalize(FunctionContext*, const StringVal& src);
 
+    // Get the HLL cardinality. The difference from the hll_finalize method is
+    // that hll_get_value does not free memory.
+    static BigIntVal hll_get_value(FunctionContext*, const StringVal& src);
+
     static StringVal hll_serialize(FunctionContext* ctx, const StringVal& src);
 
     static BigIntVal hll_cardinality(FunctionContext* ctx, const StringVal& src);
diff --git a/be/src/gutil/cpu.cc b/be/src/gutil/cpu.cc
index c02f5e5949ed43..c50e142c7d3680 100644
--- a/be/src/gutil/cpu.cc
+++ b/be/src/gutil/cpu.cc
@@ -4,10 +4,14 @@
 #include "gutil/cpu.h"
 
-#include
-#include
+#include
+#include
 
-#include "gutil/integral_types.h"
+#include
+#include
+#include
+#include
+#include
 
 #if defined(__x86_64__)
 #if defined(_MSC_VER)
@@ -17,7 +21,39 @@
 #endif
 
 namespace base {
-
+#if defined(ARCH_CPU_X86_FAMILY)
+namespace internal {
+std::tuple<int, int, int, int> ComputeX86FamilyAndModel(
+    const std::string& vendor,
+    int signature) {
+  int family = (signature >> 8) & 0xf;
+  int model = (signature >> 4) & 0xf;
+  int ext_family = 0;
+  int ext_model = 0;
+  // The "Intel 64 and IA-32 Architectures Developer's Manual: Vol. 2A"
+  // specifies the Extended Model is defined only when the Base Family is
+  // 06h or 0Fh.
+  // The "AMD CPUID Specification" specifies that the Extended Model is
+  // defined only when Base Family is 0Fh.
+  // Both manuals define the display model as
+  // {ExtendedModel[3:0],BaseModel[3:0]} in that case.
+  if (family == 0xf || (family == 0x6 && vendor == "GenuineIntel")) {
+    ext_model = (signature >> 16) & 0xf;
+    model += ext_model << 4;
+  }
+  // Both the "Intel 64 and IA-32 Architectures Developer's Manual: Vol. 2A"
+  // and the "AMD CPUID Specification" specify that the Extended Family is
+  // defined only when the Base Family is 0Fh.
+  // Both manuals define the display family as {0000b,BaseFamily[3:0]} +
+  // ExtendedFamily[7:0] in that case.
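+  // Worked example (illustrative): an AMD signature of 0x00800F11 has base
+  // family 0xf, so ext_model above is (0x00800F11 >> 16) & 0xf = 0 and the
+  // model stays 0x01; the branch below then adds ext_family
+  // (0x00800F11 >> 20) & 0xff = 0x08, giving the displayed family 0x17.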
+ if (family == 0xf) { + ext_family = (signature >> 20) & 0xff; + family += ext_family; + } + return {family, model, ext_family, ext_model}; +} +} // namespace internal +#endif // defined(ARCH_CPU_X86_FAMILY) CPU::CPU() : signature_(0), type_(0), @@ -33,61 +69,53 @@ CPU::CPU() has_ssse3_(false), has_sse41_(false), has_sse42_(false), + has_popcnt_(false), has_avx_(false), has_avx2_(false), has_aesni_(false), has_non_stop_time_stamp_counter_(false), - has_broken_neon_(false), + is_running_in_vm_(false), cpu_vendor_("unknown") { Initialize(); } - namespace { - -#if defined(__x86_64__) -#ifndef _MSC_VER - +#if defined(ARCH_CPU_X86_FAMILY) +#if !defined(COMPILER_MSVC) #if defined(__pic__) && defined(__i386__) - void __cpuid(int cpu_info[4], int info_type) { - __asm__ volatile ( - "mov %%ebx, %%edi\n" - "cpuid\n" - "xchg %%edi, %%ebx\n" - : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type) - ); + __asm__ volatile( + "mov %%ebx, %%edi\n" + "cpuid\n" + "xchg %%edi, %%ebx\n" + : "=a"(cpu_info[0]), "=D"(cpu_info[1]), "=c"(cpu_info[2]), + "=d"(cpu_info[3]) + : "a"(info_type), "c"(0)); } - #else - void __cpuid(int cpu_info[4], int info_type) { - __asm__ volatile ( - "cpuid\n" - : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), "=d"(cpu_info[3]) - : "a"(info_type), "c"(0) - ); + __asm__ volatile("cpuid\n" + : "=a"(cpu_info[0]), "=b"(cpu_info[1]), "=c"(cpu_info[2]), + "=d"(cpu_info[3]) + : "a"(info_type), "c"(0)); } - #endif - -// _xgetbv returns the value of an Intel Extended Control Register (XCR). +#endif // !defined(COMPILER_MSVC) +// xgetbv returns the value of an Intel Extended Control Register (XCR). // Currently only XCR0 is defined by Intel so |xcr| should always be zero. -uint64 _xgetbv(uint32 xcr) { - uint32 eax, edx; - +uint64_t xgetbv(uint32_t xcr) { +#if defined(COMPILER_MSVC) + return _xgetbv(xcr); +#else + uint32_t eax, edx; __asm__ volatile ( "xgetbv" : "=a"(eax), "=d"(edx) : "c"(xcr)); - return (static_cast(edx) << 32) | eax; + return (static_cast(edx) << 32) | eax; +#endif // defined(COMPILER_MSVC) } - -#endif // !_MSC_VER -#endif // __x86_64__ - +#endif // ARCH_CPU_X86_FAMILY #if defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || defined(OS_LINUX)) -class LazyCpuInfoValue { - public: - LazyCpuInfoValue() : has_broken_neon_(false) { +std::string* CpuInfoBrand() { + static std::string* brand = []() { // This function finds the value from /proc/cpuinfo under the key "model // name" or "Processor". "model name" is used in Linux 3.8 and later (3.7 // and later for arm64) and is shown once per CPU. "Processor" is used in @@ -95,112 +123,48 @@ class LazyCpuInfoValue { // regardless of the number CPUs. 
const char kModelNamePrefix[] = "model name\t: "; const char kProcessorPrefix[] = "Processor\t: "; - - // This function also calculates whether we believe that this CPU has a - // broken NEON unit based on these fields from cpuinfo: - unsigned implementer = 0, architecture = 0, variant = 0, part = 0, - revision = 0; - const struct { - const char key[17]; - unsigned int* result; - } kUnsignedValues[] = { - {"CPU implementer", &implementer}, - {"CPU architecture", &architecture}, - {"CPU variant", &variant}, - {"CPU part", &part}, - {"CPU revision", &revision}, - }; - + std::ifstream info("/proc/cpuinfo"); std::string contents; - ReadFileToString(FilePath("/proc/cpuinfo"), &contents); - DCHECK(!contents.empty()); - if (contents.empty()) { - return; - } - + contents.assign(std::istreambuf_iterator(info), std::istreambuf_iterator()); std::istringstream iss(contents); std::string line; while (std::getline(iss, line)) { - if (brand_.empty() && - (line.compare(0, strlen(kModelNamePrefix), kModelNamePrefix) == 0 || - line.compare(0, strlen(kProcessorPrefix), kProcessorPrefix) == 0)) { - brand_.assign(line.substr(strlen(kModelNamePrefix))); - } - - for (size_t i = 0; i < arraysize(kUnsignedValues); i++) { - const char *key = kUnsignedValues[i].key; - const size_t len = strlen(key); - - if (line.compare(0, len, key) == 0 && - line.size() >= len + 1 && - (line[len] == '\t' || line[len] == ' ' || line[len] == ':')) { - size_t colon_pos = line.find(':', len); - if (colon_pos == std::string::npos) { - continue; - } - - const StringPiece line_sp(line); - StringPiece value_sp = line_sp.substr(colon_pos + 1); - while (!value_sp.empty() && - (value_sp[0] == ' ' || value_sp[0] == '\t')) { - value_sp = value_sp.substr(1); - } - - // The string may have leading "0x" or not, so we use strtoul to - // handle that. - char* endptr; - std::string value(value_sp.as_string()); - unsigned long int result = strtoul(value.c_str(), &endptr, 0); - if (*endptr == 0 && result <= UINT_MAX) { - *kUnsignedValues[i].result = result; - } - } - } + if (line.compare(0, strlen(kModelNamePrefix), kModelNamePrefix) == 0) + return new std::string(line.substr(strlen(kModelNamePrefix))); + if (line.compare(0, strlen(kProcessorPrefix), kProcessorPrefix) == 0) + return new std::string(line.substr(strlen(kProcessorPrefix))); } - - has_broken_neon_ = - implementer == 0x51 && - architecture == 7 && - variant == 1 && - part == 0x4d && - revision == 0; - } - - const std::string& brand() const { return brand_; } - bool has_broken_neon() const { return has_broken_neon_; } - - private: - std::string brand_; - bool has_broken_neon_; - DISALLOW_COPY_AND_ASSIGN(LazyCpuInfoValue); -}; - -base::LazyInstance::Leaky g_lazy_cpuinfo = - LAZY_INSTANCE_INITIALIZER; - + return new std::string(); + }(); + return brand; +} #endif // defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || // defined(OS_LINUX)) - -} // anonymous namespace - +} // namespace void CPU::Initialize() { -#if defined(__x86_64__) +#if defined(ARCH_CPU_X86_FAMILY) int cpu_info[4] = {-1}; - char cpu_string[48]; - + // This array is used to temporarily hold the vendor name and then the brand + // name. Thus it has to be big enough for both use cases. There are + // static_asserts below for each of the use cases to make sure this array is + // big enough. + char cpu_string[sizeof(cpu_info) * 3 + 1]; // __cpuid with an InfoType argument of 0 returns the number of // valid Ids in CPUInfo[0] and the CPU identification string in // the other three array elements. 
The CPU identification string is // not in linear order. The code below arranges the information // in a human readable form. The human readable order is CPUInfo[1] | // CPUInfo[3] | CPUInfo[2]. CPUInfo[2] and CPUInfo[3] are swapped - // before using memcpy to copy these three array elements to cpu_string. + // before using memcpy() to copy these three array elements to |cpu_string|. __cpuid(cpu_info, 0); int num_ids = cpu_info[0]; std::swap(cpu_info[2], cpu_info[3]); - memcpy(cpu_string, &cpu_info[1], 3 * sizeof(cpu_info[1])); - cpu_vendor_.assign(cpu_string, 3 * sizeof(cpu_info[1])); - + static constexpr size_t kVendorNameSize = 3 * sizeof(cpu_info[1]); + static_assert(kVendorNameSize < sizeof(cpu_string) / sizeof(cpu_string[0]), + "cpu_string too small"); + memcpy(cpu_string, &cpu_info[1], kVendorNameSize); + cpu_string[kVendorNameSize] = '\0'; + cpu_vendor_ = cpu_string; // Interpret CPU feature information. if (num_ids > 0) { int cpu_info7[4] = {0}; @@ -210,11 +174,9 @@ void CPU::Initialize() { } signature_ = cpu_info[0]; stepping_ = cpu_info[0] & 0xf; - model_ = ((cpu_info[0] >> 4) & 0xf) + ((cpu_info[0] >> 12) & 0xf0); - family_ = (cpu_info[0] >> 8) & 0xf; type_ = (cpu_info[0] >> 12) & 0x3; - ext_model_ = (cpu_info[0] >> 16) & 0xf; - ext_family_ = (cpu_info[0] >> 20) & 0xff; + std::tie(family_, model_, ext_family_, ext_model_) = + internal::ComputeX86FamilyAndModel(cpu_vendor_, signature_); has_mmx_ = (cpu_info[3] & 0x00800000) != 0; has_sse_ = (cpu_info[3] & 0x02000000) != 0; has_sse2_ = (cpu_info[3] & 0x04000000) != 0; @@ -222,6 +184,13 @@ void CPU::Initialize() { has_ssse3_ = (cpu_info[2] & 0x00000200) != 0; has_sse41_ = (cpu_info[2] & 0x00080000) != 0; has_sse42_ = (cpu_info[2] & 0x00100000) != 0; + has_popcnt_ = (cpu_info[2] & 0x00800000) != 0; + // "Hypervisor Present Bit: Bit 31 of ECX of CPUID leaf 0x1." + // See https://lwn.net/Articles/301888/ + // This is checking for any hypervisor. Hypervisors may choose not to + // announce themselves. Hypervisors trap CPUID and sometimes return + // different results to underlying hardware. + is_running_in_vm_ = (cpu_info[2] & 0x80000000) != 0; // AVX instructions will generate an illegal instruction exception unless // a) they are supported by the CPU, // b) XSAVE is supported by the CPU and @@ -236,41 +205,60 @@ void CPU::Initialize() { (cpu_info[2] & 0x10000000) != 0 && (cpu_info[2] & 0x04000000) != 0 /* XSAVE */ && (cpu_info[2] & 0x08000000) != 0 /* OSXSAVE */ && - (_xgetbv(0) & 6) == 6 /* XSAVE enabled by kernel */; + (xgetbv(0) & 6) == 6 /* XSAVE enabled by kernel */; has_aesni_ = (cpu_info[2] & 0x02000000) != 0; has_avx2_ = has_avx_ && (cpu_info7[1] & 0x00000020) != 0; } - // Get the brand string of the cpu. 
__cpuid(cpu_info, 0x80000000); - const int parameter_end = 0x80000004; - int max_parameter = cpu_info[0]; - - if (cpu_info[0] >= parameter_end) { - char* cpu_string_ptr = cpu_string; - - for (int parameter = 0x80000002; parameter <= parameter_end && - cpu_string_ptr < &cpu_string[sizeof(cpu_string)]; parameter++) { + const int max_parameter = cpu_info[0]; + static constexpr int kParameterStart = 0x80000002; + static constexpr int kParameterEnd = 0x80000004; + static constexpr int kParameterSize = kParameterEnd - kParameterStart + 1; + static_assert(kParameterSize * sizeof(cpu_info) + 1 == sizeof(cpu_string) / sizeof(cpu_string[0]), + "cpu_string has wrong size"); + if (max_parameter >= kParameterEnd) { + size_t i = 0; + for (int parameter = kParameterStart; parameter <= kParameterEnd; + ++parameter) { __cpuid(cpu_info, parameter); - memcpy(cpu_string_ptr, cpu_info, sizeof(cpu_info)); - cpu_string_ptr += sizeof(cpu_info); + memcpy(&cpu_string[i], cpu_info, sizeof(cpu_info)); + i += sizeof(cpu_info); } - cpu_brand_.assign(cpu_string, cpu_string_ptr - cpu_string); + cpu_string[i] = '\0'; + cpu_brand_ = cpu_string; } - - const int parameter_containing_non_stop_time_stamp_counter = 0x80000007; - if (max_parameter >= parameter_containing_non_stop_time_stamp_counter) { - __cpuid(cpu_info, parameter_containing_non_stop_time_stamp_counter); + static constexpr int kParameterContainingNonStopTimeStampCounter = 0x80000007; + if (max_parameter >= kParameterContainingNonStopTimeStampCounter) { + __cpuid(cpu_info, kParameterContainingNonStopTimeStampCounter); has_non_stop_time_stamp_counter_ = (cpu_info[3] & (1 << 8)) != 0; } -#elif defined(ARCH_CPU_ARM_FAMILY) && (defined(OS_ANDROID) || defined(OS_LINUX)) - cpu_brand_.assign(g_lazy_cpuinfo.Get().brand()); - has_broken_neon_ = g_lazy_cpuinfo.Get().has_broken_neon(); -#else - #error unknown architecture + if (!has_non_stop_time_stamp_counter_ && is_running_in_vm_) { + int cpu_info_hv[4] = {}; + __cpuid(cpu_info_hv, 0x40000000); + if (cpu_info_hv[1] == 0x7263694D && // Micr + cpu_info_hv[2] == 0x666F736F && // osof + cpu_info_hv[3] == 0x76482074) { // t Hv + // If CPUID says we have a variant TSC and a hypervisor has identified + // itself and the hypervisor says it is Microsoft Hyper-V, then treat + // TSC as invariant. + // + // Microsoft Hyper-V hypervisor reports variant TSC as there are some + // scenarios (eg. VM live migration) where the TSC is variant, but for + // our purposes we can treat it as invariant. + has_non_stop_time_stamp_counter_ = true; + } + } +#elif defined(ARCH_CPU_ARM_FAMILY) +#if (defined(OS_ANDROID) || defined(OS_LINUX)) + cpu_brand_ = *CpuInfoBrand(); +#elif defined(OS_WIN) + // Windows makes high-resolution thread timing information available in + // user-space. + has_non_stop_time_stamp_counter_ = true; +#endif #endif } - CPU::IntelMicroArchitecture CPU::GetIntelMicroArchitecture() const { if (has_avx2()) return AVX2; if (has_avx()) return AVX; @@ -282,5 +270,4 @@ CPU::IntelMicroArchitecture CPU::GetIntelMicroArchitecture() const { if (has_sse()) return SSE; return PENTIUM; } - } // namespace base diff --git a/be/src/gutil/cpu.h b/be/src/gutil/cpu.h index 65498140d172ba..2115251a42f36d 100644 --- a/be/src/gutil/cpu.h +++ b/be/src/gutil/cpu.h @@ -1,20 +1,57 @@ // Copyright (c) 2012 The Chromium Authors. All rights reserved. // Use of this source code is governed by a BSD-style license that can be // found in the LICENSE file. 
- #ifndef BASE_CPU_H_ #define BASE_CPU_H_ - #include +#include -namespace base { +#if defined(__APPLE__) +#define OS_MACOSX 1 +#elif defined(__ANDROID__) +#define OS_ANDROID 1 +#elif defined(__linux__) +#define OS_LINUX 1 +#elif defined(_WIN32) +#define OS_WIN 1 +#endif + +#if defined(_M_X64) || defined(__x86_64__) +#define ARCH_CPU_X86_FAMILY 1 +#define ARCH_CPU_X86_64 1 +#define ARCH_CPU_64_BITS 1 +#elif defined(_M_IX86) || defined(__i386__) +#define ARCH_CPU_X86_FAMILY 1 +#define ARCH_CPU_X86 1 +#define ARCH_CPU_32_BITS 1 +#elif defined(__ARMEL__) +#define ARCH_CPU_ARM_FAMILY 1 +#define ARCH_CPU_ARMEL 1 +#define ARCH_CPU_32_BITS 1 +#elif defined(_M_ARM64) || defined(__aarch64__) +#define ARCH_CPU_ARM_FAMILY 1 +#define ARCH_CPU_ARM64 1 +#define ARCH_CPU_64_BITS 1 +#endif +#if defined(OS_MACOSX) || defined(OS_LINUX) || defined(OS_ANDROID) +#define OS_POSIX 1 +#endif + +namespace base { +#if defined(ARCH_CPU_X86_FAMILY) +namespace internal { +// Compute the CPU family and model based on the vendor and CPUID signature. +// Returns in order: family, model, extended family, extended model. +std::tuple ComputeX86FamilyAndModel( + const std::string& vendor, + int signature); +} // namespace internal +#endif // defined(ARCH_CPU_X86_FAMILY) // Query information about the processor. -class CPU { +class CPU final { public: - // Constructor CPU(); - enum IntelMicroArchitecture { PENTIUM, SSE, @@ -27,7 +64,6 @@ class CPU { AVX2, MAX_INTEL_MICRO_ARCHITECTURE }; - // Accessors for CPU information. const std::string& vendor_name() const { return cpu_vendor_; } int signature() const { return signature_; } @@ -44,24 +80,19 @@ class CPU { bool has_ssse3() const { return has_ssse3_; } bool has_sse41() const { return has_sse41_; } bool has_sse42() const { return has_sse42_; } + bool has_popcnt() const { return has_popcnt_; } bool has_avx() const { return has_avx_; } bool has_avx2() const { return has_avx2_; } bool has_aesni() const { return has_aesni_; } bool has_non_stop_time_stamp_counter() const { return has_non_stop_time_stamp_counter_; } - // has_broken_neon is only valid on ARM chips. If true, it indicates that we - // believe that the NEON unit on the current CPU is flawed and cannot execute - // some code. See https://code.google.com/p/chromium/issues/detail?id=341598 - bool has_broken_neon() const { return has_broken_neon_; } - + bool is_running_in_vm() const { return is_running_in_vm_; } IntelMicroArchitecture GetIntelMicroArchitecture() const; const std::string& cpu_brand() const { return cpu_brand_; } - private: // Query the processor for CPUID information. void Initialize(); - int signature_; // raw form of type, family, model, and stepping int type_; // process type int family_; // family of the processor @@ -76,15 +107,14 @@ class CPU { bool has_ssse3_; bool has_sse41_; bool has_sse42_; + bool has_popcnt_; bool has_avx_; bool has_avx2_; bool has_aesni_; bool has_non_stop_time_stamp_counter_; - bool has_broken_neon_; + bool is_running_in_vm_; std::string cpu_vendor_; std::string cpu_brand_; }; - } // namespace base - #endif // BASE_CPU_H_ diff --git a/be/src/gutil/linux_syscall_support.h b/be/src/gutil/linux_syscall_support.h index 5476d0bfa664ef..13aa415e2503eb 100644 --- a/be/src/gutil/linux_syscall_support.h +++ b/be/src/gutil/linux_syscall_support.h @@ -1,3 +1,4 @@ +// -*- Mode: C++; c-basic-offset: 2; indent-tabs-mode: nil -*- /* Copyright (c) 2005-2008, Google Inc. * All rights reserved. * @@ -69,15 +70,74 @@ * This file defines a few internal symbols that all start with "LSS_". 
* Do not access these symbols from outside this file. They are not part * of the supported API. + * + * NOTE: This is a stripped down version of the official opensource + * version of linux_syscall_support.h, which lives at + * http://code.google.com/p/linux-syscall-support/ + * It includes only the syscalls that are used in perftools, plus a + * few extra. Here's the breakdown: + * 1) Perftools uses these: grep -rho 'sys_[a-z0-9_A-Z]* *(' src | sort -u + * sys__exit( + * sys_clone( + * sys_close( + * sys_fcntl( + * sys_fstat( + * sys_futex( + * sys_getcpu( + * sys_getdents64( + * sys_getppid( + * sys_gettid( + * sys_lseek( + * sys_mmap( + * sys_mremap( + * sys_munmap( + * sys_open( + * sys_pipe( + * sys_prctl( + * sys_ptrace( + * sys_ptrace_detach( + * sys_read( + * sys_sched_yield( + * sys_sigaction( + * sys_sigaltstack( + * sys_sigdelset( + * sys_sigfillset( + * sys_sigprocmask( + * sys_socket( + * sys_stat( + * sys_waitpid( + * 2) These are used as subroutines of the above: + * sys_getpid -- gettid + * sys_kill -- ptrace_detach + * sys_restore -- sigaction + * sys_restore_rt -- sigaction + * sys_socketcall -- socket + * sys_wait4 -- waitpid + * 3) I left these in even though they're not used. They either + * complement the above (write vs read) or are variants (rt_sigaction): + * sys_fstat64 + * sys_llseek + * sys_mmap2 + * sys_openat + * sys_getdents + * sys_rt_sigaction + * sys_rt_sigprocmask + * sys_sigaddset + * sys_sigemptyset + * sys_stat64 + * sys_write */ #ifndef SYS_LINUX_SYSCALL_SUPPORT_H #define SYS_LINUX_SYSCALL_SUPPORT_H -/* We currently only support x86-32, x86-64, ARM, MIPS, and PPC on Linux. +/* We currently only support x86-32, x86-64, ARM, MIPS, PPC/PPC64, Aarch64, s390 and s390x + * on Linux. * Porting to other related platforms should not be difficult. */ #if (defined(__i386__) || defined(__x86_64__) || defined(__arm__) || \ - defined(__mips__) || defined(__PPC__)) && defined(__linux) + defined(__mips__) || defined(__PPC__) || \ + defined(__aarch64__) || defined(__s390__)) \ + && (defined(__linux)) #ifndef SYS_CPLUSPLUS #ifdef __cplusplus @@ -91,6 +151,8 @@ extern "C" { #include #include #include +#include +#include #include #include #include @@ -100,6 +162,7 @@ extern "C" { #include #include #include +#include #ifdef __mips__ /* Include definitions of the ABI currently in use. 
*/ @@ -154,36 +217,6 @@ struct kernel_dirent { char d_name[256]; }; -/* include/linux/uio.h */ -struct kernel_iovec { - void *iov_base; - unsigned long iov_len; -}; - -/* include/linux/socket.h */ -struct kernel_msghdr { - void *msg_name; - int msg_namelen; - struct kernel_iovec*msg_iov; - unsigned long msg_iovlen; - void *msg_control; - unsigned long msg_controllen; - unsigned msg_flags; -}; - -/* include/asm-generic/poll.h */ -struct kernel_pollfd { - int fd; - short events; - short revents; -}; - -/* include/linux/resource.h */ -struct kernel_rlimit { - unsigned long rlim_cur; - unsigned long rlim_max; -}; - /* include/linux/time.h */ struct kernel_timespec { long tv_sec; @@ -216,26 +249,14 @@ struct kernel_rusage { long ru_nivcsw; }; -/* include/linux/capablilty.h */ -struct kernel_cap_user_header { - unsigned int version; - int pid; -}; - -struct kernel_cap_user_data { - unsigned int effective; - unsigned int permitted; - unsigned int inheritable; -}; - -struct siginfo; -#if defined(__i386__) || defined(__arm__) || defined(__PPC__) +#if defined(__i386__) || defined(__arm__) \ + || defined(__PPC__) || (defined(__s390__) && !defined(__s390x__)) /* include/asm-{arm,i386,mips,ppc}/signal.h */ struct kernel_old_sigaction { union { void (*sa_handler_)(int); - void (*sa_sigaction_)(int, struct siginfo *, void *); + void (*sa_sigaction_)(int, siginfo_t *, void *); }; unsigned long sa_mask; unsigned long sa_flags; @@ -243,6 +264,8 @@ struct kernel_old_sigaction { } __attribute__((packed,aligned(4))); #elif (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) #define kernel_old_sigaction kernel_sigaction +#elif defined(__aarch64__) + // No kernel_old_sigaction defined for arm64. #endif /* Some kernel functions (e.g. sigaction() in 2.6.23) require that the @@ -266,19 +289,19 @@ struct kernel_sigset_t { (8*sizeof(unsigned long))]; }; -/* include/asm-{arm,i386,mips,x86_64,ppc}/signal.h */ +/* include/asm-{arm,generic,i386,mips,x86_64,ppc}/signal.h */ struct kernel_sigaction { #ifdef __mips__ unsigned long sa_flags; union { void (*sa_handler_)(int); - void (*sa_sigaction_)(int, struct siginfo *, void *); + void (*sa_sigaction_)(int, siginfo_t *, void *); }; struct kernel_sigset_t sa_mask; #else union { void (*sa_handler_)(int); - void (*sa_sigaction_)(int, struct siginfo *, void *); + void (*sa_sigaction_)(int, siginfo_t *, void *); }; unsigned long sa_flags; void (*sa_restorer)(void); @@ -286,15 +309,9 @@ struct kernel_sigaction { #endif }; -/* include/linux/socket.h */ -struct kernel_sockaddr { - unsigned short sa_family; - char sa_data[14]; -}; - -/* include/asm-{arm,i386,mips,ppc}/stat.h */ +/* include/asm-{arm,i386,mips,ppc,s390}/stat.h */ #ifdef __mips__ -#if _MIPS_SIM == _MIPS_SIM_ABI64 +#if (_MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32) struct kernel_stat { #else struct kernel_stat64 { @@ -319,27 +336,25 @@ struct kernel_stat64 { unsigned __pad2; unsigned long long st_blocks; }; -#elif defined __PPC__ && !defined __PPC64__ +#elif defined __PPC__ struct kernel_stat64 { unsigned long long st_dev; unsigned long long st_ino; - unsigned st_mode; unsigned st_nlink; + unsigned st_mode; unsigned st_uid; unsigned st_gid; + int __pad2; unsigned long long st_rdev; - unsigned short int __pad2; long long st_size; - long st_blksize; + long long st_blksize; long long st_blocks; - long st_atime_; - unsigned long st_atime_nsec_; - long st_mtime_; - unsigned long st_mtime_nsec_; - long st_ctime_; - unsigned long st_ctime_nsec_; + kernel_timespec st_atim; + kernel_timespec st_mtim; + 
kernel_timespec st_ctim; unsigned long __unused4; unsigned long __unused5; + unsigned long __unused6; }; #else struct kernel_stat64 { @@ -365,7 +380,7 @@ struct kernel_stat64 { }; #endif -/* include/asm-{arm,i386,mips,x86_64,ppc}/stat.h */ +/* include/asm-{arm,generic,i386,mips,x86_64,ppc,s390}/stat.h */ #if defined(__i386__) || defined(__arm__) struct kernel_stat { /* The kernel headers suggest that st_dev and st_rdev should be 32bit @@ -396,55 +411,47 @@ struct kernel_stat { }; #elif defined(__x86_64__) struct kernel_stat { - unsigned long st_dev; - unsigned long st_ino; - unsigned long st_nlink; + uint64_t st_dev; + uint64_t st_ino; + uint64_t st_nlink; unsigned st_mode; unsigned st_uid; unsigned st_gid; unsigned __pad0; - unsigned long st_rdev; - long st_size; - long st_blksize; - long st_blocks; - unsigned long st_atime_; - unsigned long st_atime_nsec_; - unsigned long st_mtime_; - unsigned long st_mtime_nsec_; - unsigned long st_ctime_; - unsigned long st_ctime_nsec_; - long __unused[3]; + uint64_t st_rdev; + int64_t st_size; + int64_t st_blksize; + int64_t st_blocks; + uint64_t st_atime_; + uint64_t st_atime_nsec_; + uint64_t st_mtime_; + uint64_t st_mtime_nsec_; + uint64_t st_ctime_; + uint64_t st_ctime_nsec_; + int64_t __unused[3]; }; #elif defined(__PPC__) struct kernel_stat { - unsigned long st_dev; - unsigned long st_ino; // ino_t -#ifdef __PPC64__ - unsigned long st_nlink; // nlink_t - unsigned int st_mode; // mode_t -#else - unsigned int st_mode; // mode_t - unsigned short st_nlink; // nlink_t -#endif - unsigned int st_uid; // uid_t - unsigned int st_gid; // gid_t - unsigned long st_rdev; - long st_size; // off_t + unsigned long long st_dev; + unsigned long st_ino; + unsigned long st_nlink; + unsigned long st_mode; + unsigned st_uid; + unsigned st_gid; + int __pad2; + unsigned long long st_rdev; + long st_size; unsigned long st_blksize; unsigned long st_blocks; - unsigned long st_atime_; - unsigned long st_atime_nsec_; - unsigned long st_mtime_; - unsigned long st_mtime_nsec_; - unsigned long st_ctime_; - unsigned long st_ctime_nsec_; + kernel_timespec st_atim; + kernel_timespec st_mtim; + kernel_timespec st_ctim; unsigned long __unused4; unsigned long __unused5; -#ifdef __PPC64__ unsigned long __unused6; -#endif }; -#elif (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64) +#elif defined(__mips__) \ + && !(_MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32) struct kernel_stat { unsigned st_dev; int st_pad1[3]; @@ -467,145 +474,90 @@ struct kernel_stat { int st_blocks; int st_pad4[14]; }; -#endif - -/* include/asm-{arm,i386,mips,x86_64,ppc}/statfs.h */ -#ifdef __mips__ -#if _MIPS_SIM != _MIPS_SIM_ABI64 -struct kernel_statfs64 { - unsigned long f_type; - unsigned long f_bsize; - unsigned long f_frsize; - unsigned long __pad; - unsigned long long f_blocks; - unsigned long long f_bfree; - unsigned long long f_files; - unsigned long long f_ffree; - unsigned long long f_bavail; - struct { int val[2]; } f_fsid; - unsigned long f_namelen; - unsigned long f_spare[6]; -}; -#endif -#elif !defined(__x86_64__) -struct kernel_statfs64 { - unsigned long f_type; - unsigned long f_bsize; - unsigned long long f_blocks; - unsigned long long f_bfree; - unsigned long long f_bavail; - unsigned long long f_files; - unsigned long long f_ffree; - struct { int val[2]; } f_fsid; - unsigned long f_namelen; - unsigned long f_frsize; - unsigned long f_spare[5]; -}; -#endif - -/* include/asm-{arm,i386,mips,x86_64,ppc,generic}/statfs.h */ -#ifdef __mips__ -struct kernel_statfs { - long f_type; - 
long f_bsize; - long f_frsize; - long f_blocks; - long f_bfree; - long f_files; - long f_ffree; - long f_bavail; - struct { int val[2]; } f_fsid; - long f_namelen; - long f_spare[6]; -}; -#else -struct kernel_statfs { - /* x86_64 actually defines all these fields as signed, whereas all other */ - /* platforms define them as unsigned. Leaving them at unsigned should not */ - /* cause any problems. */ - unsigned long f_type; - unsigned long f_bsize; - unsigned long f_blocks; - unsigned long f_bfree; - unsigned long f_bavail; - unsigned long f_files; - unsigned long f_ffree; - struct { int val[2]; } f_fsid; - unsigned long f_namelen; - unsigned long f_frsize; - unsigned long f_spare[5]; +#elif defined(__aarch64__) +struct kernel_stat { + unsigned long st_dev; + unsigned long st_ino; + unsigned int st_mode; + unsigned int st_nlink; + unsigned int st_uid; + unsigned int st_gid; + unsigned long st_rdev; + unsigned long __pad1; + long st_size; + int st_blksize; + int __pad2; + long st_blocks; + long st_atime_; + unsigned long st_atime_nsec_; + long st_mtime_; + unsigned long st_mtime_nsec_; + long st_ctime_; + unsigned long st_ctime_nsec_; + unsigned int __unused4; + unsigned int __unused5; }; -#endif - -#if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || \ - defined(__PPC__) -/* include/linux/aio_abi.h */ -/* Layout depends on big/little endian. */ -struct kernel_iocb { - unsigned long long aio_data; - unsigned int aio_key; - unsigned int aio_reserved; - unsigned short aio_lio_opcode; - short aio_reqprio; - unsigned int aio_filedes; - unsigned long long aio_buf; - unsigned long long aio_nbytes; - unsigned long long aio_offset; - unsigned long long aio_reserved2; - unsigned int aio_flags; - unsigned int aio_resfd; +#elif defined(__s390x__) +struct kernel_stat { + unsigned long st_dev; + unsigned long st_ino; + unsigned long st_nlink; + unsigned int st_mode; + unsigned int st_uid; + unsigned int st_gid; + unsigned int __pad1; + unsigned long st_rdev; + unsigned long st_size; + unsigned long st_atime_; + unsigned long st_atime_nsec_; + unsigned long st_mtime_; + unsigned long st_mtime_nsec_; + unsigned long st_ctime_; + unsigned long st_ctime_nsec_; + unsigned long st_blksize; + long st_blocks; + unsigned long __unused[3]; }; -#elif defined(__PPC__) -struct kernel_iocb { - unsigned long long aio_data; - unsigned int aio_reserved; - unsigned int aio_key; - unsigned short aio_lio_opcode; - short aio_reqprio; - unsigned int aio_fildes; - unsigned long long aio_buf; - unsigned long long aio_nbytes; - unsigned long long aio_offset; - unsigned long long aio_reserved2; - unsigned int aio_flags; - unsigned int aio_resfd; +#elif defined(__s390__) +struct kernel_stat { + unsigned short st_dev; + unsigned short __pad1; + unsigned long st_ino; + unsigned short st_mode; + unsigned short st_nlink; + unsigned short st_uid; + unsigned short st_gid; + unsigned short st_rdev; + unsigned short __pad2; + unsigned long st_size; + unsigned long st_blksize; + unsigned long st_blocks; + unsigned long st_atime_; + unsigned long st_atime_nsec_; + unsigned long st_mtime_; + unsigned long st_mtime_nsec_; + unsigned long st_ctime_; + unsigned long st_ctime_nsec_; + unsigned long __unused4; + unsigned long __unused5; }; #endif -/* include/linux/aio_abi.h */ -struct kernel_io_event { - unsigned long long data; - unsigned long long obj; - long long res; - long long res2; -}; /* Definitions missing from the standard header files */ #ifndef O_DIRECTORY -#if defined(__arm__) || defined(__PPC_) +#if defined(__arm__) 
#define O_DIRECTORY 0040000 #else #define O_DIRECTORY 0200000 #endif #endif -#ifndef NT_PRXFPREG -#define NT_PRXFPREG 0x46e62b7f -#endif -#ifndef PTRACE_GETFPXREGS -#define PTRACE_GETFPXREGS ((enum __ptrace_request)18) -#endif #ifndef PR_GET_DUMPABLE #define PR_GET_DUMPABLE 3 #endif #ifndef PR_SET_DUMPABLE #define PR_SET_DUMPABLE 4 #endif -#ifndef PR_GET_SECCOMP -#define PR_GET_SECCOMP 21 -#endif -#ifndef PR_SET_SECCOMP -#define PR_SET_SECCOMP 22 -#endif #ifndef AT_FDCWD #define AT_FDCWD (-100) #endif @@ -621,126 +573,11 @@ struct kernel_io_event { #ifndef SA_RESTORER #define SA_RESTORER 0x04000000 #endif -#ifndef CPUCLOCK_PROF -#define CPUCLOCK_PROF 0 -#endif -#ifndef CPUCLOCK_VIRT -#define CPUCLOCK_VIRT 1 -#endif -#ifndef CPUCLOCK_SCHED -#define CPUCLOCK_SCHED 2 -#endif -#ifndef CPUCLOCK_PERTHREAD_MASK -#define CPUCLOCK_PERTHREAD_MASK 4 -#endif -#ifndef MAKE_PROCESS_CPUCLOCK -#define MAKE_PROCESS_CPUCLOCK(pid, clock) \ - ((~(int)(pid) << 3) | (int)(clock)) -#endif -#ifndef MAKE_THREAD_CPUCLOCK -#define MAKE_THREAD_CPUCLOCK(tid, clock) \ - ((~(int)(tid) << 3) | (int)((clock) | CPUCLOCK_PERTHREAD_MASK)) -#endif - -#ifndef FUTEX_WAIT -#define FUTEX_WAIT 0 -#endif -#ifndef FUTEX_WAKE -#define FUTEX_WAKE 1 -#endif -#ifndef FUTEX_FD -#define FUTEX_FD 2 -#endif -#ifndef FUTEX_REQUEUE -#define FUTEX_REQUEUE 3 -#endif -#ifndef FUTEX_CMP_REQUEUE -#define FUTEX_CMP_REQUEUE 4 -#endif -#ifndef FUTEX_WAKE_OP -#define FUTEX_WAKE_OP 5 -#endif -#ifndef FUTEX_LOCK_PI -#define FUTEX_LOCK_PI 6 -#endif -#ifndef FUTEX_UNLOCK_PI -#define FUTEX_UNLOCK_PI 7 -#endif -#ifndef FUTEX_TRYLOCK_PI -#define FUTEX_TRYLOCK_PI 8 -#endif -#ifndef FUTEX_PRIVATE_FLAG -#define FUTEX_PRIVATE_FLAG 128 -#endif -#ifndef FUTEX_CMD_MASK -#define FUTEX_CMD_MASK ~FUTEX_PRIVATE_FLAG -#endif -#ifndef FUTEX_WAIT_PRIVATE -#define FUTEX_WAIT_PRIVATE (FUTEX_WAIT | FUTEX_PRIVATE_FLAG) -#endif -#ifndef FUTEX_WAKE_PRIVATE -#define FUTEX_WAKE_PRIVATE (FUTEX_WAKE | FUTEX_PRIVATE_FLAG) -#endif -#ifndef FUTEX_REQUEUE_PRIVATE -#define FUTEX_REQUEUE_PRIVATE (FUTEX_REQUEUE | FUTEX_PRIVATE_FLAG) -#endif -#ifndef FUTEX_CMP_REQUEUE_PRIVATE -#define FUTEX_CMP_REQUEUE_PRIVATE (FUTEX_CMP_REQUEUE | FUTEX_PRIVATE_FLAG) -#endif -#ifndef FUTEX_WAKE_OP_PRIVATE -#define FUTEX_WAKE_OP_PRIVATE (FUTEX_WAKE_OP | FUTEX_PRIVATE_FLAG) -#endif -#ifndef FUTEX_LOCK_PI_PRIVATE -#define FUTEX_LOCK_PI_PRIVATE (FUTEX_LOCK_PI | FUTEX_PRIVATE_FLAG) -#endif -#ifndef FUTEX_UNLOCK_PI_PRIVATE -#define FUTEX_UNLOCK_PI_PRIVATE (FUTEX_UNLOCK_PI | FUTEX_PRIVATE_FLAG) -#endif -#ifndef FUTEX_TRYLOCK_PI_PRIVATE -#define FUTEX_TRYLOCK_PI_PRIVATE (FUTEX_TRYLOCK_PI | FUTEX_PRIVATE_FLAG) -#endif - - -#if defined(__x86_64__) -#ifndef ARCH_SET_GS -#define ARCH_SET_GS 0x1001 -#endif -#ifndef ARCH_GET_GS -#define ARCH_GET_GS 0x1004 -#endif -#endif #if defined(__i386__) -#ifndef __NR_mount -#define __NR_mount 21 -#endif -#ifndef __NR_setgroups32 -#define __NR_setgroups32 81 -#endif -#ifndef __NR_quotactl -#define __NR_quotactl 131 -#endif -#ifndef __NR_setresuid -#define __NR_setresuid 164 -#define __NR_getresuid 165 -#define __NR_setresgid 170 -#define __NR_getresgid 171 -#endif #ifndef __NR_rt_sigaction -#define __NR_rt_sigreturn 173 #define __NR_rt_sigaction 174 #define __NR_rt_sigprocmask 175 -#define __NR_rt_sigpending 176 -#define __NR_rt_sigsuspend 179 -#endif -#ifndef __NR_pread64 -#define __NR_pread64 180 -#endif -#ifndef __NR_pwrite64 -#define __NR_pwrite64 181 -#endif -#ifndef __NR_ugetrlimit -#define __NR_ugetrlimit 191 #endif #ifndef __NR_stat64 #define __NR_stat64 195 @@ -748,114 
+585,21 @@ struct kernel_io_event { #ifndef __NR_fstat64 #define __NR_fstat64 197 #endif -#ifndef __NR_setresuid32 -#define __NR_setresuid32 208 -#define __NR_getresuid32 209 -#define __NR_setresgid32 210 -#define __NR_getresgid32 211 -#endif -#ifndef __NR_setfsuid32 -#define __NR_setfsuid32 215 -#define __NR_setfsgid32 216 -#endif #ifndef __NR_getdents64 #define __NR_getdents64 220 #endif #ifndef __NR_gettid #define __NR_gettid 224 #endif -#ifndef __NR_readahead -#define __NR_readahead 225 -#endif -#ifndef __NR_setxattr -#define __NR_setxattr 226 -#endif -#ifndef __NR_lsetxattr -#define __NR_lsetxattr 227 -#endif -#ifndef __NR_getxattr -#define __NR_getxattr 229 -#endif -#ifndef __NR_lgetxattr -#define __NR_lgetxattr 230 -#endif -#ifndef __NR_listxattr -#define __NR_listxattr 232 -#endif -#ifndef __NR_llistxattr -#define __NR_llistxattr 233 -#endif -#ifndef __NR_tkill -#define __NR_tkill 238 -#endif #ifndef __NR_futex #define __NR_futex 240 #endif -#ifndef __NR_sched_setaffinity -#define __NR_sched_setaffinity 241 -#define __NR_sched_getaffinity 242 -#endif -#ifndef __NR_io_setup -#define __NR_io_setup 245 -#define __NR_io_destroy 246 -#define __NR_io_getevents 247 -#define __NR_io_submit 248 -#define __NR_io_cancel 249 -#endif -#ifndef __NR_set_tid_address -#define __NR_set_tid_address 258 -#endif -#ifndef __NR_clock_gettime -#define __NR_clock_gettime 265 -#endif -#ifndef __NR_clock_getres -#define __NR_clock_getres 266 -#endif -#ifndef __NR_statfs64 -#define __NR_statfs64 268 -#endif -#ifndef __NR_fstatfs64 -#define __NR_fstatfs64 269 -#endif -#ifndef __NR_fadvise64_64 -#define __NR_fadvise64_64 272 -#endif -#ifndef __NR_ioprio_set -#define __NR_ioprio_set 289 -#endif -#ifndef __NR_ioprio_get -#define __NR_ioprio_get 290 -#endif #ifndef __NR_openat #define __NR_openat 295 #endif -#ifndef __NR_fstatat64 -#define __NR_fstatat64 300 -#endif -#ifndef __NR_unlinkat -#define __NR_unlinkat 301 -#endif -#ifndef __NR_unshare -#define __NR_unshare 310 -#endif -#ifndef __NR_move_pages -#define __NR_move_pages 317 -#endif #ifndef __NR_getcpu #define __NR_getcpu 318 #endif -#ifndef __NR_fallocate -#define __NR_fallocate 324 -#endif -#ifndef __NR_preadv -#define __NR_preadv 333 -#endif -#ifndef __NR_pwritev -#define __NR_pwritev 334 -#endif -#ifndef __NR_setns -#define __NR_setns 346 -#endif /* End of i386 definitions */ #elif defined(__arm__) #ifndef __syscall @@ -875,33 +619,9 @@ struct kernel_io_event { #define __syscall_safe(name) __syscall(name) #endif #endif -#ifndef __NR_mount -#define __NR_mount (__NR_SYSCALL_BASE + 21) -#endif -#ifndef __NR_setresuid -#define __NR_setresuid (__NR_SYSCALL_BASE + 164) -#define __NR_getresuid (__NR_SYSCALL_BASE + 165) -#define __NR_setresgid (__NR_SYSCALL_BASE + 170) -#define __NR_getresgid (__NR_SYSCALL_BASE + 171) -#endif #ifndef __NR_rt_sigaction -#define __NR_rt_sigreturn (__NR_SYSCALL_BASE + 173) #define __NR_rt_sigaction (__NR_SYSCALL_BASE + 174) #define __NR_rt_sigprocmask (__NR_SYSCALL_BASE + 175) -#define __NR_rt_sigpending (__NR_SYSCALL_BASE + 176) -#define __NR_rt_sigsuspend (__NR_SYSCALL_BASE + 179) -#endif -#ifndef __NR_pread64 -#define __NR_pread64 (__NR_SYSCALL_BASE + 180) -#endif -#ifndef __NR_pwrite64 -#define __NR_pwrite64 (__NR_SYSCALL_BASE + 181) -#endif -#ifndef __NR_capset -#define __NR_capset (__NR_SYSCALL_BASE + 185) -#endif -#ifndef __NR_ugetrlimit -#define __NR_ugetrlimit (__NR_SYSCALL_BASE + 191) #endif #ifndef __NR_stat64 #define __NR_stat64 (__NR_SYSCALL_BASE + 195) @@ -909,228 +629,41 @@ struct kernel_io_event { #ifndef 
__NR_fstat64 #define __NR_fstat64 (__NR_SYSCALL_BASE + 197) #endif -#ifndef __NR_setgroups32 -#define __NR_setgroups32 (__NR_SYSCALL_BASE + 206) -#endif -#ifndef __NR_setresuid32 -#define __NR_setresuid32 (__NR_SYSCALL_BASE + 208) -#define __NR_getresuid32 (__NR_SYSCALL_BASE + 209) -#define __NR_setresgid32 (__NR_SYSCALL_BASE + 210) -#define __NR_getresgid32 (__NR_SYSCALL_BASE + 211) -#endif -#ifndef __NR_setfsuid32 -#define __NR_setfsuid32 (__NR_SYSCALL_BASE + 215) -#define __NR_setfsgid32 (__NR_SYSCALL_BASE + 216) -#endif #ifndef __NR_getdents64 #define __NR_getdents64 (__NR_SYSCALL_BASE + 217) #endif #ifndef __NR_gettid #define __NR_gettid (__NR_SYSCALL_BASE + 224) #endif -#ifndef __NR_readahead -#define __NR_readahead (__NR_SYSCALL_BASE + 225) -#endif -#ifndef __NR_setxattr -#define __NR_setxattr (__NR_SYSCALL_BASE + 226) +#ifndef __NR_futex +#define __NR_futex (__NR_SYSCALL_BASE + 240) #endif -#ifndef __NR_lsetxattr -#define __NR_lsetxattr (__NR_SYSCALL_BASE + 227) +/* End of ARM definitions */ +#elif defined(__x86_64__) +#ifndef __NR_gettid +#define __NR_gettid 186 #endif -#ifndef __NR_getxattr -#define __NR_getxattr (__NR_SYSCALL_BASE + 229) +#ifndef __NR_futex +#define __NR_futex 202 #endif -#ifndef __NR_lgetxattr -#define __NR_lgetxattr (__NR_SYSCALL_BASE + 230) +#ifndef __NR_getdents64 +#define __NR_getdents64 217 #endif -#ifndef __NR_listxattr -#define __NR_listxattr (__NR_SYSCALL_BASE + 232) +#ifndef __NR_openat +#define __NR_openat 257 #endif -#ifndef __NR_llistxattr -#define __NR_llistxattr (__NR_SYSCALL_BASE + 233) +/* End of x86-64 definitions */ +#elif defined(__mips__) +#if _MIPS_SIM == _MIPS_SIM_ABI32 +#ifndef __NR_rt_sigaction +#define __NR_rt_sigaction (__NR_Linux + 194) +#define __NR_rt_sigprocmask (__NR_Linux + 195) #endif -#ifndef __NR_tkill -#define __NR_tkill (__NR_SYSCALL_BASE + 238) +#ifndef __NR_stat64 +#define __NR_stat64 (__NR_Linux + 213) #endif -#ifndef __NR_futex -#define __NR_futex (__NR_SYSCALL_BASE + 240) -#endif -#ifndef __NR_sched_setaffinity -#define __NR_sched_setaffinity (__NR_SYSCALL_BASE + 241) -#define __NR_sched_getaffinity (__NR_SYSCALL_BASE + 242) -#endif -#ifndef __NR_set_tid_address -#define __NR_set_tid_address (__NR_SYSCALL_BASE + 256) -#endif -#ifndef __NR_clock_gettime -#define __NR_clock_gettime (__NR_SYSCALL_BASE + 263) -#endif -#ifndef __NR_clock_getres -#define __NR_clock_getres (__NR_SYSCALL_BASE + 264) -#endif -#ifndef __NR_statfs64 -#define __NR_statfs64 (__NR_SYSCALL_BASE + 266) -#endif -#ifndef __NR_fstatfs64 -#define __NR_fstatfs64 (__NR_SYSCALL_BASE + 267) -#endif -#ifndef __NR_ioprio_set -#define __NR_ioprio_set (__NR_SYSCALL_BASE + 314) -#endif -#ifndef __NR_ioprio_get -#define __NR_ioprio_get (__NR_SYSCALL_BASE + 315) -#endif -#ifndef __NR_unshare -#define __NR_unshare (__NR_SYSCALL_BASE + 337) -#endif -#ifndef __NR_move_pages -#define __NR_move_pages (__NR_SYSCALL_BASE + 344) -#endif -#ifndef __NR_setns -#define __NR_setns (__NR_SYSCALL_BASE + 375) -#endif -/* End of ARM definitions */ -#elif defined(__x86_64__) -#ifndef __NR_pread64 -#define __NR_pread64 17 -#endif -#ifndef __NR_pwrite64 -#define __NR_pwrite64 18 -#endif -#ifndef __NR_setresuid -#define __NR_setresuid 117 -#define __NR_getresuid 118 -#define __NR_setresgid 119 -#define __NR_getresgid 120 -#endif -#ifndef __NR_mount -#define __NR_mount 165 -#endif -#ifndef __NR_quotactl -#define __NR_quotactl 179 -#endif -#ifndef __NR_gettid -#define __NR_gettid 186 -#endif -#ifndef __NR_readahead -#define __NR_readahead 187 -#endif -#ifndef __NR_setxattr -#define 
__NR_setxattr 188 -#endif -#ifndef __NR_lsetxattr -#define __NR_lsetxattr 189 -#endif -#ifndef __NR_getxattr -#define __NR_getxattr 191 -#endif -#ifndef __NR_lgetxattr -#define __NR_lgetxattr 192 -#endif -#ifndef __NR_listxattr -#define __NR_listxattr 194 -#endif -#ifndef __NR_llistxattr -#define __NR_llistxattr 195 -#endif -#ifndef __NR_tkill -#define __NR_tkill 200 -#endif -#ifndef __NR_futex -#define __NR_futex 202 -#endif -#ifndef __NR_sched_setaffinity -#define __NR_sched_setaffinity 203 -#define __NR_sched_getaffinity 204 -#endif -#ifndef __NR_io_setup -#define __NR_io_setup 206 -#define __NR_io_destroy 207 -#define __NR_io_getevents 208 -#define __NR_io_submit 209 -#define __NR_io_cancel 210 -#endif -#ifndef __NR_getdents64 -#define __NR_getdents64 217 -#endif -#ifndef __NR_set_tid_address -#define __NR_set_tid_address 218 -#endif -#ifndef __NR_fadvise64 -#define __NR_fadvise64 221 -#endif -#ifndef __NR_clock_gettime -#define __NR_clock_gettime 228 -#endif -#ifndef __NR_clock_getres -#define __NR_clock_getres 229 -#endif -#ifndef __NR_ioprio_set -#define __NR_ioprio_set 251 -#endif -#ifndef __NR_ioprio_get -#define __NR_ioprio_get 252 -#endif -#ifndef __NR_openat -#define __NR_openat 257 -#endif -#ifndef __NR_newfstatat -#define __NR_newfstatat 262 -#endif -#ifndef __NR_unlinkat -#define __NR_unlinkat 263 -#endif -#ifndef __NR_unshare -#define __NR_unshare 272 -#endif -#ifndef __NR_move_pages -#define __NR_move_pages 279 -#endif -#ifndef __NR_fallocate -#define __NR_fallocate 285 -#endif -#ifndef __NR_preadv -#define __NR_preadv 295 -#endif -#ifndef __NR_pwritev -#define __NR_pwritev 296 -#endif -#ifndef __NR_setns -#define __NR_setns 308 -#endif -/* End of x86-64 definitions */ -#elif defined(__mips__) -#if _MIPS_SIM == _MIPS_SIM_ABI32 -#ifndef __NR_mount -#define __NR_mount (__NR_Linux + 21) -#endif -#ifndef __NR_setresuid -#define __NR_setresuid (__NR_Linux + 185) -#define __NR_getresuid (__NR_Linux + 186) -#define __NR_setresgid (__NR_Linux + 190) -#define __NR_getresgid (__NR_Linux + 191) -#endif -#ifndef __NR_rt_sigaction -#define __NR_rt_sigreturn (__NR_Linux + 193) -#define __NR_rt_sigaction (__NR_Linux + 194) -#define __NR_rt_sigprocmask (__NR_Linux + 195) -#define __NR_rt_sigpending (__NR_Linux + 196) -#define __NR_rt_sigsuspend (__NR_Linux + 199) -#endif -#ifndef __NR_pread64 -#define __NR_pread64 (__NR_Linux + 200) -#endif -#ifndef __NR_pwrite64 -#define __NR_pwrite64 (__NR_Linux + 201) -#endif -#ifndef __NR_capset -#define __NR_capset (__NR_Linux + 205) -#endif -#ifndef __NR_stat64 -#define __NR_stat64 (__NR_Linux + 213) -#endif -#ifndef __NR_fstat64 -#define __NR_fstat64 (__NR_Linux + 215) +#ifndef __NR_fstat64 +#define __NR_fstat64 (__NR_Linux + 215) #endif #ifndef __NR_getdents64 #define __NR_getdents64 (__NR_Linux + 219) @@ -1138,390 +671,273 @@ struct kernel_io_event { #ifndef __NR_gettid #define __NR_gettid (__NR_Linux + 222) #endif -#ifndef __NR_readahead -#define __NR_readahead (__NR_Linux + 223) -#endif -#ifndef __NR_setxattr -#define __NR_setxattr (__NR_Linux + 224) -#endif -#ifndef __NR_lsetxattr -#define __NR_lsetxattr (__NR_Linux + 225) -#endif -#ifndef __NR_getxattr -#define __NR_getxattr (__NR_Linux + 227) -#endif -#ifndef __NR_lgetxattr -#define __NR_lgetxattr (__NR_Linux + 228) -#endif -#ifndef __NR_listxattr -#define __NR_listxattr (__NR_Linux + 230) -#endif -#ifndef __NR_llistxattr -#define __NR_llistxattr (__NR_Linux + 231) -#endif -#ifndef __NR_tkill -#define __NR_tkill (__NR_Linux + 236) -#endif #ifndef __NR_futex #define __NR_futex (__NR_Linux + 
238) #endif -#ifndef __NR_sched_setaffinity -#define __NR_sched_setaffinity (__NR_Linux + 239) -#define __NR_sched_getaffinity (__NR_Linux + 240) -#endif -#ifndef __NR_set_tid_address -#define __NR_set_tid_address (__NR_Linux + 252) -#endif -#ifndef __NR_statfs64 -#define __NR_statfs64 (__NR_Linux + 255) -#endif -#ifndef __NR_fstatfs64 -#define __NR_fstatfs64 (__NR_Linux + 256) -#endif -#ifndef __NR_clock_gettime -#define __NR_clock_gettime (__NR_Linux + 263) -#endif -#ifndef __NR_clock_getres -#define __NR_clock_getres (__NR_Linux + 264) -#endif #ifndef __NR_openat #define __NR_openat (__NR_Linux + 288) #endif #ifndef __NR_fstatat #define __NR_fstatat (__NR_Linux + 293) #endif -#ifndef __NR_unlinkat -#define __NR_unlinkat (__NR_Linux + 294) -#endif -#ifndef __NR_unshare -#define __NR_unshare (__NR_Linux + 303) -#endif -#ifndef __NR_move_pages -#define __NR_move_pages (__NR_Linux + 308) -#endif #ifndef __NR_getcpu #define __NR_getcpu (__NR_Linux + 312) #endif -#ifndef __NR_ioprio_set -#define __NR_ioprio_set (__NR_Linux + 314) -#endif -#ifndef __NR_ioprio_get -#define __NR_ioprio_get (__NR_Linux + 315) -#endif -#ifndef __NR_setns -#define __NR_setns (__NR_Linux + 344) -#endif /* End of MIPS (old 32bit API) definitions */ -#elif _MIPS_SIM == _MIPS_SIM_ABI64 -#ifndef __NR_pread64 -#define __NR_pread64 (__NR_Linux + 16) -#endif -#ifndef __NR_pwrite64 -#define __NR_pwrite64 (__NR_Linux + 17) -#endif -#ifndef __NR_setresuid -#define __NR_setresuid (__NR_Linux + 115) -#define __NR_getresuid (__NR_Linux + 116) -#define __NR_setresgid (__NR_Linux + 117) -#define __NR_getresgid (__NR_Linux + 118) -#endif -#ifndef __NR_capset -#define __NR_capset (__NR_Linux + 124) -#endif -#ifndef __NR_mount -#define __NR_mount (__NR_Linux + 160) -#endif +#elif (_MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32) #ifndef __NR_gettid #define __NR_gettid (__NR_Linux + 178) #endif -#ifndef __NR_readahead -#define __NR_readahead (__NR_Linux + 179) -#endif -#ifndef __NR_setxattr -#define __NR_setxattr (__NR_Linux + 180) -#endif -#ifndef __NR_lsetxattr -#define __NR_lsetxattr (__NR_Linux + 181) -#endif -#ifndef __NR_getxattr -#define __NR_getxattr (__NR_Linux + 183) -#endif -#ifndef __NR_lgetxattr -#define __NR_lgetxattr (__NR_Linux + 184) -#endif -#ifndef __NR_listxattr -#define __NR_listxattr (__NR_Linux + 186) -#endif -#ifndef __NR_llistxattr -#define __NR_llistxattr (__NR_Linux + 187) -#endif -#ifndef __NR_tkill -#define __NR_tkill (__NR_Linux + 192) -#endif #ifndef __NR_futex #define __NR_futex (__NR_Linux + 194) #endif -#ifndef __NR_sched_setaffinity -#define __NR_sched_setaffinity (__NR_Linux + 195) -#define __NR_sched_getaffinity (__NR_Linux + 196) -#endif -#ifndef __NR_set_tid_address -#define __NR_set_tid_address (__NR_Linux + 212) -#endif -#ifndef __NR_clock_gettime -#define __NR_clock_gettime (__NR_Linux + 222) -#endif -#ifndef __NR_clock_getres -#define __NR_clock_getres (__NR_Linux + 223) -#endif #ifndef __NR_openat #define __NR_openat (__NR_Linux + 247) #endif #ifndef __NR_fstatat #define __NR_fstatat (__NR_Linux + 252) #endif -#ifndef __NR_unlinkat -#define __NR_unlinkat (__NR_Linux + 253) -#endif -#ifndef __NR_unshare -#define __NR_unshare (__NR_Linux + 262) -#endif -#ifndef __NR_move_pages -#define __NR_move_pages (__NR_Linux + 267) -#endif #ifndef __NR_getcpu #define __NR_getcpu (__NR_Linux + 271) #endif -#ifndef __NR_ioprio_set -#define __NR_ioprio_set (__NR_Linux + 273) -#endif -#ifndef __NR_ioprio_get -#define __NR_ioprio_get (__NR_Linux + 274) -#endif -#ifndef __NR_setns -#define 
__NR_setns (__NR_Linux + 303) -#endif /* End of MIPS (64bit API) definitions */ #else -#ifndef __NR_mount -#define __NR_mount (__NR_Linux + 160) -#endif -#ifndef __NR_setresuid -#define __NR_setresuid (__NR_Linux + 115) -#define __NR_getresuid (__NR_Linux + 116) -#define __NR_setresgid (__NR_Linux + 117) -#define __NR_getresgid (__NR_Linux + 118) -#endif -#ifndef __NR_capset -#define __NR_capset (__NR_Linux + 124) -#endif #ifndef __NR_gettid #define __NR_gettid (__NR_Linux + 178) #endif -#ifndef __NR_readahead -#define __NR_readahead (__NR_Linux + 179) -#endif -#ifndef __NR_setxattr -#define __NR_setxattr (__NR_Linux + 180) -#endif -#ifndef __NR_lsetxattr -#define __NR_lsetxattr (__NR_Linux + 181) -#endif -#ifndef __NR_getxattr -#define __NR_getxattr (__NR_Linux + 183) -#endif -#ifndef __NR_lgetxattr -#define __NR_lgetxattr (__NR_Linux + 184) -#endif -#ifndef __NR_listxattr -#define __NR_listxattr (__NR_Linux + 186) -#endif -#ifndef __NR_llistxattr -#define __NR_llistxattr (__NR_Linux + 187) -#endif -#ifndef __NR_tkill -#define __NR_tkill (__NR_Linux + 192) -#endif #ifndef __NR_futex #define __NR_futex (__NR_Linux + 194) #endif -#ifndef __NR_sched_setaffinity -#define __NR_sched_setaffinity (__NR_Linux + 195) -#define __NR_sched_getaffinity (__NR_Linux + 196) -#endif -#ifndef __NR_set_tid_address -#define __NR_set_tid_address (__NR_Linux + 213) -#endif -#ifndef __NR_statfs64 -#define __NR_statfs64 (__NR_Linux + 217) -#endif -#ifndef __NR_fstatfs64 -#define __NR_fstatfs64 (__NR_Linux + 218) -#endif -#ifndef __NR_clock_gettime -#define __NR_clock_gettime (__NR_Linux + 226) -#endif -#ifndef __NR_clock_getres -#define __NR_clock_getres (__NR_Linux + 227) -#endif #ifndef __NR_openat #define __NR_openat (__NR_Linux + 251) #endif #ifndef __NR_fstatat #define __NR_fstatat (__NR_Linux + 256) #endif -#ifndef __NR_unlinkat -#define __NR_unlinkat (__NR_Linux + 257) -#endif -#ifndef __NR_unshare -#define __NR_unshare (__NR_Linux + 266) -#endif -#ifndef __NR_move_pages -#define __NR_move_pages (__NR_Linux + 271) -#endif #ifndef __NR_getcpu #define __NR_getcpu (__NR_Linux + 275) #endif -#ifndef __NR_ioprio_set -#define __NR_ioprio_set (__NR_Linux + 277) -#endif -#ifndef __NR_ioprio_get -#define __NR_ioprio_get (__NR_Linux + 278) -#endif -#ifndef __NR_setns -#define __NR_setns (__NR_Linux + 308) -#endif /* End of MIPS (new 32bit API) definitions */ #endif /* End of MIPS definitions */ #elif defined(__PPC__) -#ifndef __NR_mount -#define __NR_mount 21 -#endif -#ifndef __NR_setfsuid -#define __NR_setfsuid 138 -#define __NR_setfsgid 139 -#endif -#ifndef __NR_setresuid -#define __NR_setresuid 164 -#define __NR_getresuid 165 -#define __NR_setresgid 169 -#define __NR_getresgid 170 -#endif #ifndef __NR_rt_sigaction -#define __NR_rt_sigreturn 172 #define __NR_rt_sigaction 173 #define __NR_rt_sigprocmask 174 -#define __NR_rt_sigpending 175 -#define __NR_rt_sigsuspend 178 -#endif -#ifndef __NR_pread64 -#define __NR_pread64 179 -#endif -#ifndef __NR_pwrite64 -#define __NR_pwrite64 180 -#endif -#ifndef __NR_capset -#define __NR_capset 184 #endif -#ifndef __NR_ugetrlimit -#define __NR_ugetrlimit 190 -#endif -#ifndef __NR_readahead -#define __NR_readahead 191 -#endif -#ifndef __PPC64__ #ifndef __NR_stat64 #define __NR_stat64 195 #endif #ifndef __NR_fstat64 #define __NR_fstat64 197 #endif -#endif /* !defined(__PPC64__) */ +#ifndef __NR_socket +#define __NR_socket 198 +#endif #ifndef __NR_getdents64 #define __NR_getdents64 202 #endif #ifndef __NR_gettid #define __NR_gettid 207 #endif -#ifndef __NR_tkill -#define 
__NR_tkill 208 +#ifndef __NR_futex +#define __NR_futex 221 +#endif +#ifndef __NR_openat +#define __NR_openat 286 +#endif +#ifndef __NR_getcpu +#define __NR_getcpu 302 +#endif +/* End of powerpc defininitions */ +#elif defined(__aarch64__) +#ifndef __NR_fstatat +#define __NR_fstatat 79 +#endif +/* End of aarch64 defininitions */ +#elif defined(__s390__) +#ifndef __NR_quotactl +#define __NR_quotactl 131 +#endif +#ifndef __NR_rt_sigreturn +#define __NR_rt_sigreturn 173 +#endif +#ifndef __NR_rt_sigaction +#define __NR_rt_sigaction 174 +#endif +#ifndef __NR_rt_sigprocmask +#define __NR_rt_sigprocmask 175 +#endif +#ifndef __NR_rt_sigpending +#define __NR_rt_sigpending 176 +#endif +#ifndef __NR_rt_sigsuspend +#define __NR_rt_sigsuspend 179 +#endif +#ifndef __NR_pread64 +#define __NR_pread64 180 +#endif +#ifndef __NR_pwrite64 +#define __NR_pwrite64 181 +#endif +#ifndef __NR_getdents64 +#define __NR_getdents64 220 +#endif +#ifndef __NR_readahead +#define __NR_readahead 222 #endif #ifndef __NR_setxattr -#define __NR_setxattr 209 +#define __NR_setxattr 224 #endif #ifndef __NR_lsetxattr -#define __NR_lsetxattr 210 +#define __NR_lsetxattr 225 #endif #ifndef __NR_getxattr -#define __NR_getxattr 212 +#define __NR_getxattr 227 #endif #ifndef __NR_lgetxattr -#define __NR_lgetxattr 213 +#define __NR_lgetxattr 228 #endif #ifndef __NR_listxattr -#define __NR_listxattr 215 +#define __NR_listxattr 230 #endif #ifndef __NR_llistxattr -#define __NR_llistxattr 216 +#define __NR_llistxattr 231 +#endif +#ifndef __NR_gettid +#define __NR_gettid 236 +#endif +#ifndef __NR_tkill +#define __NR_tkill 237 #endif #ifndef __NR_futex -#define __NR_futex 221 +#define __NR_futex 238 #endif #ifndef __NR_sched_setaffinity -#define __NR_sched_setaffinity 222 -#define __NR_sched_getaffinity 223 +#define __NR_sched_setaffinity 239 +#endif +#ifndef __NR_sched_getaffinity +#define __NR_sched_getaffinity 240 #endif #ifndef __NR_set_tid_address -#define __NR_set_tid_address 232 +#define __NR_set_tid_address 252 #endif #ifndef __NR_clock_gettime -#define __NR_clock_gettime 246 +#define __NR_clock_gettime 260 #endif #ifndef __NR_clock_getres -#define __NR_clock_getres 247 +#define __NR_clock_getres 261 #endif #ifndef __NR_statfs64 -#define __NR_statfs64 252 +#define __NR_statfs64 265 #endif #ifndef __NR_fstatfs64 -#define __NR_fstatfs64 253 -#endif -#ifndef __PPC64__ -#ifndef __NR_fadvise64_64 -#define __NR_fadvise64_64 254 +#define __NR_fstatfs64 266 #endif -#endif /* !defined(__PPC64__) */ #ifndef __NR_ioprio_set -#define __NR_ioprio_set 273 +#define __NR_ioprio_set 282 #endif #ifndef __NR_ioprio_get -#define __NR_ioprio_get 274 -#endif -#ifndef __NR_unshare -#define __NR_unshare 282 +#define __NR_ioprio_get 283 #endif #ifndef __NR_openat -#define __NR_openat 286 -#endif -#ifndef __PPC64__ -#ifndef __NR_fstatat64 -#define __NR_fstatat64 291 +#define __NR_openat 288 #endif -#endif /* !defined(__PPC64__) */ #ifndef __NR_unlinkat -#define __NR_unlinkat 292 +#define __NR_unlinkat 294 #endif #ifndef __NR_move_pages -#define __NR_move_pages 301 +#define __NR_move_pages 310 #endif #ifndef __NR_getcpu -#define __NR_getcpu 302 -#endif -#ifndef __NR_setns -#define __NR_setns 350 +#define __NR_getcpu 311 #endif -/* End of powerpc defininitions */ +#ifndef __NR_fallocate +#define __NR_fallocate 314 +#endif +/* Some syscalls are named/numbered differently between s390 and s390x. 
*/ +#ifdef __s390x__ +# ifndef __NR_getrlimit +# define __NR_getrlimit 191 +# endif +# ifndef __NR_setresuid +# define __NR_setresuid 208 +# endif +# ifndef __NR_getresuid +# define __NR_getresuid 209 +# endif +# ifndef __NR_setresgid +# define __NR_setresgid 210 +# endif +# ifndef __NR_getresgid +# define __NR_getresgid 211 +# endif +# ifndef __NR_setfsuid +# define __NR_setfsuid 215 +# endif +# ifndef __NR_setfsgid +# define __NR_setfsgid 216 +# endif +# ifndef __NR_fadvise64 +# define __NR_fadvise64 253 +# endif +# ifndef __NR_newfstatat +# define __NR_newfstatat 293 +# endif +#else /* __s390x__ */ +# ifndef __NR_getrlimit +# define __NR_getrlimit 76 +# endif +# ifndef __NR_setfsuid +# define __NR_setfsuid 138 +# endif +# ifndef __NR_setfsgid +# define __NR_setfsgid 139 +# endif +# ifndef __NR_setresuid +# define __NR_setresuid 164 +# endif +# ifndef __NR_getresuid +# define __NR_getresuid 165 +# endif +# ifndef __NR_setresgid +# define __NR_setresgid 170 +# endif +# ifndef __NR_getresgid +# define __NR_getresgid 171 +# endif +# ifndef __NR_ugetrlimit +# define __NR_ugetrlimit 191 +# endif +# ifndef __NR_mmap2 +# define __NR_mmap2 192 +# endif +# ifndef __NR_setresuid32 +# define __NR_setresuid32 208 +# endif +# ifndef __NR_getresuid32 +# define __NR_getresuid32 209 +# endif +# ifndef __NR_setresgid32 +# define __NR_setresgid32 210 +# endif +# ifndef __NR_getresgid32 +# define __NR_getresgid32 211 +# endif +# ifndef __NR_setfsuid32 +# define __NR_setfsuid32 215 +# endif +# ifndef __NR_setfsgid32 +# define __NR_setfsgid32 216 +# endif +# ifndef __NR_fadvise64_64 +# define __NR_fadvise64_64 264 +# endif +# ifndef __NR_fstatat64 +# define __NR_fstatat64 293 +# endif +#endif /* __s390__ */ +/* End of s390/s390x definitions */ #endif @@ -1584,7 +1000,8 @@ struct kernel_io_event { #endif #undef LSS_RETURN - #if (defined(__i386__) || defined(__x86_64__) || defined(__arm__)) + #if (defined(__i386__) || defined(__x86_64__) || defined(__arm__) || \ + defined(__aarch64__) || defined(__s390__)) /* Failing system calls return a negative result in the range of * -1..-4095. These are "errno" values with the sign inverted. */ @@ -1818,35 +1235,6 @@ struct kernel_io_event { LSS_RETURN(int, __res); } - #define __NR__fadvise64_64 __NR_fadvise64_64 - LSS_INLINE _syscall6(int, _fadvise64_64, int, fd, - unsigned, offset_lo, unsigned, offset_hi, - unsigned, len_lo, unsigned, len_hi, - int, advice) - - LSS_INLINE int LSS_NAME(fadvise64)(int fd, loff_t offset, - loff_t len, int advice) { - return LSS_NAME(_fadvise64_64)(fd, - (unsigned)offset, (unsigned)(offset >>32), - (unsigned)len, (unsigned)(len >> 32), - advice); - } - - #define __NR__fallocate __NR_fallocate - LSS_INLINE _syscall6(int, _fallocate, int, fd, - int, mode, - unsigned, offset_lo, unsigned, offset_hi, - unsigned, len_lo, unsigned, len_hi) - - LSS_INLINE int LSS_NAME(fallocate)(int fd, int mode, - loff_t offset, loff_t len) { - union { loff_t off; unsigned w[2]; } o = { offset }, l = { len }; - return LSS_NAME(_fallocate)(fd, mode, o.w[0], o.w[1], l.w[0], l.w[1]); - } - - LSS_INLINE _syscall1(int, set_thread_area, void *, u) - LSS_INLINE _syscall1(int, get_thread_area, void *, u) - LSS_INLINE void (*LSS_NAME(restore_rt)(void))(void) { /* On i386, the kernel does not know how to return from a signal * handler. Instead, it relies on user space to provide a @@ -1891,74 +1279,141 @@ struct kernel_io_event { * location (e.g. when using the clone() system call with the CLONE_VM * option). 
*/ + #undef LSS_ENTRYPOINT + #define LSS_ENTRYPOINT "syscall\n" + + /* The x32 ABI has 32 bit longs, but the syscall interface is 64 bit. + * We need to explicitly cast to an unsigned 64 bit type to avoid implicit + * sign extension. We can't cast pointers directly because those are + * 32 bits, and gcc will dump ugly warnings about casting from a pointer + * to an integer of a different size. + */ + #undef LSS_SYSCALL_ARG + #define LSS_SYSCALL_ARG(a) ((uint64_t)(uintptr_t)(a)) + #undef _LSS_RETURN + #define _LSS_RETURN(type, res, cast) \ + do { \ + if ((uint64_t)(res) >= (uint64_t)(-4095)) { \ + LSS_ERRNO = -(res); \ + res = -1; \ + } \ + return (type)(cast)(res); \ + } while (0) + #undef LSS_RETURN + #define LSS_RETURN(type, res) _LSS_RETURN(type, res, uintptr_t) + + #undef _LSS_BODY + #define _LSS_BODY(nr, type, name, cast, ...) \ + long long __res; \ + __asm__ __volatile__(LSS_BODY_ASM##nr LSS_ENTRYPOINT \ + : "=a" (__res) \ + : "0" (__NR_##name) LSS_BODY_ARG##nr(__VA_ARGS__) \ + : LSS_BODY_CLOBBER##nr "r11", "rcx", "memory"); \ + _LSS_RETURN(type, __res, cast) #undef LSS_BODY - #define LSS_BODY(type,name, ...) \ - long __res; \ - __asm__ __volatile__("syscall" : "=a" (__res) : "0" (__NR_##name), \ - ##__VA_ARGS__ : "r11", "rcx", "memory"); \ - LSS_RETURN(type, __res) + #define LSS_BODY(nr, type, name, args...) \ + _LSS_BODY(nr, type, name, uintptr_t, ## args) + + #undef LSS_BODY_ASM0 + #undef LSS_BODY_ASM1 + #undef LSS_BODY_ASM2 + #undef LSS_BODY_ASM3 + #undef LSS_BODY_ASM4 + #undef LSS_BODY_ASM5 + #undef LSS_BODY_ASM6 + #define LSS_BODY_ASM0 + #define LSS_BODY_ASM1 LSS_BODY_ASM0 + #define LSS_BODY_ASM2 LSS_BODY_ASM1 + #define LSS_BODY_ASM3 LSS_BODY_ASM2 + #define LSS_BODY_ASM4 LSS_BODY_ASM3 "movq %5,%%r10;" + #define LSS_BODY_ASM5 LSS_BODY_ASM4 "movq %6,%%r8;" + #define LSS_BODY_ASM6 LSS_BODY_ASM5 "movq %7,%%r9;" + + #undef LSS_BODY_CLOBBER0 + #undef LSS_BODY_CLOBBER1 + #undef LSS_BODY_CLOBBER2 + #undef LSS_BODY_CLOBBER3 + #undef LSS_BODY_CLOBBER4 + #undef LSS_BODY_CLOBBER5 + #undef LSS_BODY_CLOBBER6 + #define LSS_BODY_CLOBBER0 + #define LSS_BODY_CLOBBER1 LSS_BODY_CLOBBER0 + #define LSS_BODY_CLOBBER2 LSS_BODY_CLOBBER1 + #define LSS_BODY_CLOBBER3 LSS_BODY_CLOBBER2 + #define LSS_BODY_CLOBBER4 LSS_BODY_CLOBBER3 "r10", + #define LSS_BODY_CLOBBER5 LSS_BODY_CLOBBER4 "r8", + #define LSS_BODY_CLOBBER6 LSS_BODY_CLOBBER5 "r9", + + #undef LSS_BODY_ARG0 + #undef LSS_BODY_ARG1 + #undef LSS_BODY_ARG2 + #undef LSS_BODY_ARG3 + #undef LSS_BODY_ARG4 + #undef LSS_BODY_ARG5 + #undef LSS_BODY_ARG6 + #define LSS_BODY_ARG0() + #define LSS_BODY_ARG1(arg1) \ + LSS_BODY_ARG0(), "D" (arg1) + #define LSS_BODY_ARG2(arg1, arg2) \ + LSS_BODY_ARG1(arg1), "S" (arg2) + #define LSS_BODY_ARG3(arg1, arg2, arg3) \ + LSS_BODY_ARG2(arg1, arg2), "d" (arg3) + #define LSS_BODY_ARG4(arg1, arg2, arg3, arg4) \ + LSS_BODY_ARG3(arg1, arg2, arg3), "r" (arg4) + #define LSS_BODY_ARG5(arg1, arg2, arg3, arg4, arg5) \ + LSS_BODY_ARG4(arg1, arg2, arg3, arg4), "r" (arg5) + #define LSS_BODY_ARG6(arg1, arg2, arg3, arg4, arg5, arg6) \ + LSS_BODY_ARG5(arg1, arg2, arg3, arg4, arg5), "r" (arg6) + #undef _syscall0 #define _syscall0(type,name) \ type LSS_NAME(name)() { \ - LSS_BODY(type, name); \ + LSS_BODY(0, type, name); \ } #undef _syscall1 #define _syscall1(type,name,type1,arg1) \ type LSS_NAME(name)(type1 arg1) { \ - LSS_BODY(type, name, "D" ((long)(arg1))); \ + LSS_BODY(1, type, name, LSS_SYSCALL_ARG(arg1)); \ } #undef _syscall2 #define _syscall2(type,name,type1,arg1,type2,arg2) \ type LSS_NAME(name)(type1 arg1, type2 arg2) { \ - 
LSS_BODY(type, name, "D" ((long)(arg1)), "S" ((long)(arg2))); \ + LSS_BODY(2, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2));\ } #undef _syscall3 #define _syscall3(type,name,type1,arg1,type2,arg2,type3,arg3) \ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \ - LSS_BODY(type, name, "D" ((long)(arg1)), "S" ((long)(arg2)), \ - "d" ((long)(arg3))); \ + LSS_BODY(3, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2), \ + LSS_SYSCALL_ARG(arg3)); \ } #undef _syscall4 #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \ - long __res; \ - __asm__ __volatile__("movq %5,%%r10; syscall" : \ - "=a" (__res) : "0" (__NR_##name), \ - "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \ - "r" ((long)(arg4)) : "r10", "r11", "rcx", "memory"); \ - LSS_RETURN(type, __res); \ + LSS_BODY(4, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2), \ + LSS_SYSCALL_ARG(arg3), LSS_SYSCALL_ARG(arg4));\ } #undef _syscall5 #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ type5,arg5) \ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ type5 arg5) { \ - long __res; \ - __asm__ __volatile__("movq %5,%%r10; movq %6,%%r8; syscall" : \ - "=a" (__res) : "0" (__NR_##name), \ - "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \ - "r" ((long)(arg4)), "r" ((long)(arg5)) : \ - "r8", "r10", "r11", "rcx", "memory"); \ - LSS_RETURN(type, __res); \ + LSS_BODY(5, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2), \ + LSS_SYSCALL_ARG(arg3), LSS_SYSCALL_ARG(arg4), \ + LSS_SYSCALL_ARG(arg5)); \ } #undef _syscall6 #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \ type5,arg5,type6,arg6) \ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \ type5 arg5, type6 arg6) { \ - long __res; \ - __asm__ __volatile__("movq %5,%%r10; movq %6,%%r8; movq %7,%%r9;" \ - "syscall" : \ - "=a" (__res) : "0" (__NR_##name), \ - "D" ((long)(arg1)), "S" ((long)(arg2)), "d" ((long)(arg3)), \ - "r" ((long)(arg4)), "r" ((long)(arg5)), "r" ((long)(arg6)) : \ - "r8", "r9", "r10", "r11", "rcx", "memory"); \ - LSS_RETURN(type, __res); \ + LSS_BODY(6, type, name, LSS_SYSCALL_ARG(arg1), LSS_SYSCALL_ARG(arg2), \ + LSS_SYSCALL_ARG(arg3), LSS_SYSCALL_ARG(arg4), \ + LSS_SYSCALL_ARG(arg5), LSS_SYSCALL_ARG(arg6));\ } LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, int flags, void *arg, int *parent_tidptr, void *newtls, int *child_tidptr) { - long __res; + long long __res; { __asm__ __volatile__(/* if (fn == NULL) * return -EINVAL; @@ -2023,15 +1478,17 @@ struct kernel_io_event { "1:\n" : "=a" (__res) : "0"(-EINVAL), "i"(__NR_clone), "i"(__NR_exit), - "r"(fn), "S"(child_stack), "D"(flags), "r"(arg), - "d"(parent_tidptr), "g"(newtls), "g"(child_tidptr) + "r"(LSS_SYSCALL_ARG(fn)), + "S"(LSS_SYSCALL_ARG(child_stack)), + "D"(LSS_SYSCALL_ARG(flags)), + "r"(LSS_SYSCALL_ARG(arg)), + "d"(LSS_SYSCALL_ARG(parent_tidptr)), + "r"(LSS_SYSCALL_ARG(newtls)), + "r"(LSS_SYSCALL_ARG(child_tidptr)) : "rsp", "memory", "r8", "r10", "r11", "rcx"); } LSS_RETURN(int, __res); } - LSS_INLINE _syscall2(int, arch_prctl, int, c, void *, a) - LSS_INLINE _syscall4(int, fadvise64, int, fd, loff_t, offset, loff_t, len, - int, advice) LSS_INLINE void (*LSS_NAME(restore_rt)(void))(void) { /* On x86-64, the kernel does not know how to return from @@ -2040,7 +1497,7 @@ struct kernel_io_event { * Unfortunately, we cannot just reference the glibc version of this * 
function, as glibc goes out of its way to make it inaccessible. */ - void (*res)(void); + long long res; __asm__ __volatile__("call 2f\n" "0:.align 16\n" "1:movq %1,%%rax\n" @@ -2049,7 +1506,7 @@ struct kernel_io_event { "addq $(1b-0b),%0\n" : "=a" (res) : "i" (__NR_rt_sigreturn)); - return res; + return (void (*)(void))(uintptr_t)res; } #elif defined(__arm__) /* Most definitions of _syscallX() neglect to mark "memory" as being @@ -2247,14 +1704,24 @@ struct kernel_io_event { #undef LSS_REG #define LSS_REG(r,a) register unsigned long __r##r __asm__("$"#r) = \ (unsigned long)(a) + + #if _MIPS_SIM == _MIPS_SIM_ABI32 + // See http://sources.redhat.com/ml/libc-alpha/2004-10/msg00050.html + // or http://www.linux-mips.org/archives/linux-mips/2004-10/msg00142.html + #define MIPS_SYSCALL_CLOBBERS "$1", "$3", "$8", "$9", "$10", "$11", "$12",\ + "$13", "$14", "$15", "$24", "$25", "memory" + #else + #define MIPS_SYSCALL_CLOBBERS "$1", "$3", "$10", "$11", "$12", "$13", \ + "$14", "$15", "$24", "$25", "memory" + #endif + #undef LSS_BODY #define LSS_BODY(type,name,r7,...) \ register unsigned long __v0 __asm__("$2") = __NR_##name; \ __asm__ __volatile__ ("syscall\n" \ : "=&r"(__v0), r7 (__r7) \ : "0"(__v0), ##__VA_ARGS__ \ - : "$8", "$9", "$10", "$11", "$12", \ - "$13", "$14", "$15", "$24", "memory"); \ + : MIPS_SYSCALL_CLOBBERS); \ LSS_RETURN(type, __v0, __r7) #undef _syscall0 #define _syscall0(type, name) \ @@ -2312,8 +1779,7 @@ struct kernel_io_event { : "=&r"(__v0), "+r" (__r7) \ : "i" (__NR_##name), "r"(__r4), "r"(__r5), \ "r"(__r6), "m" ((unsigned long)arg5) \ - : "$8", "$9", "$10", "$11", "$12", \ - "$13", "$14", "$15", "$24", "memory"); \ + : MIPS_SYSCALL_CLOBBERS); \ LSS_RETURN(type, __v0, __r7); \ } #else @@ -2351,10 +1817,9 @@ struct kernel_io_event { ".set reorder\n" \ : "=&r"(__v0), "+r" (__r7) \ : "i" (__NR_##name), "r"(__r4), "r"(__r5), \ - "r"(__r6), "r" ((unsigned long)arg5), \ - "r" ((unsigned long)arg6) \ - : "$8", "$9", "$10", "$11", "$12", \ - "$13", "$14", "$15", "$24", "memory"); \ + "r"(__r6), "m" ((unsigned long)arg5), \ + "m" ((unsigned long)arg6) \ + : MIPS_SYSCALL_CLOBBERS); \ LSS_RETURN(type, __v0, __r7); \ } #else @@ -2509,13 +1974,13 @@ struct kernel_io_event { #define LSS_BODY(nr, type, name, args...) 
\
 long __sc_ret, __sc_err; \
 { \
- register unsigned long __sc_0 __asm__ ("r0"); \
- register unsigned long __sc_3 __asm__ ("r3"); \
- register unsigned long __sc_4 __asm__ ("r4"); \
- register unsigned long __sc_5 __asm__ ("r5"); \
- register unsigned long __sc_6 __asm__ ("r6"); \
- register unsigned long __sc_7 __asm__ ("r7"); \
- register unsigned long __sc_8 __asm__ ("r8"); \
+ register unsigned long __sc_0 __asm__ ("r0"); \
+ register unsigned long __sc_3 __asm__ ("r3"); \
+ register unsigned long __sc_4 __asm__ ("r4"); \
+ register unsigned long __sc_5 __asm__ ("r5"); \
+ register unsigned long __sc_6 __asm__ ("r6"); \
+ register unsigned long __sc_7 __asm__ ("r7"); \
+ register unsigned long __sc_8 __asm__ ("r8"); \
 \
 LSS_LOADARGS_##nr(name, args); \
 __asm__ __volatile__ \
@@ -2572,26 +2037,98 @@ struct kernel_io_event {
 type5 arg5, type6 arg6) { \
 LSS_BODY(6, type, name, arg1, arg2, arg3, arg4, arg5, arg6); \
 }
-
- #undef LSS_PPC_MINIMUM_FRAME_SIZE
- #undef LSS_SIZE_S
- #ifdef __PPC64__
- #define LSS_PPC_MINIMUM_FRAME_SIZE 112
- #define LSS_SIZE_S "d"
- #else
- #define LSS_PPC_MINIMUM_FRAME_SIZE 16
- #define LSS_SIZE_S "w"
- #endif
-
- /* clone function adapted from glibc 2.3.6 clone.S */
- /* TODO(user): consider wrapping some args up in a struct, like we
- * do for i386's _syscall6, so we can compile successfully on gcc 2.95
- */
+ /* clone function adapted from glibc 2.18 clone.S */
 LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
 int flags, void *arg, int *parent_tidptr,
 void *newtls, int *child_tidptr) {
 long __ret, __err;
 {
+#if defined(__PPC64__)
+
+/* Stack frame offsets. */
+#if _CALL_ELF != 2
+#define FRAME_MIN_SIZE 112
+#define FRAME_TOC_SAVE 40
+#else
+#define FRAME_MIN_SIZE 32
+#define FRAME_TOC_SAVE 24
+#endif
+
+
+ register int (*__fn)(void *) __asm__ ("r3") = fn;
+ register void *__cstack __asm__ ("r4") = child_stack;
+ register int __flags __asm__ ("r5") = flags;
+ register void * __arg __asm__ ("r6") = arg;
+ register int * __ptidptr __asm__ ("r7") = parent_tidptr;
+ register void * __newtls __asm__ ("r8") = newtls;
+ register int * __ctidptr __asm__ ("r9") = child_tidptr;
+ __asm__ __volatile__(
+ /* check for fn == NULL
+ * and child_stack == NULL
+ */
+ "cmpdi cr0, %6, 0\n\t"
+ "cmpdi cr1, %7, 0\n\t"
+ "cror cr0*4+eq, cr1*4+eq, cr0*4+eq\n\t"
+ "beq- cr0, 1f\n\t"
+
+ /* set up stack frame for child */
+ "clrrdi %7, %7, 4\n\t"
+ "li 0, 0\n\t"
+ "stdu 0, -%13(%7)\n\t"
+
+ /* fn, arg, child_stack are saved across the syscall */
+ "mr 28, %6\n\t"
+ "mr 29, %7\n\t"
+ "mr 27, %9\n\t"
+
+ /* syscall
+ r3 == flags
+ r4 == child_stack
+ r5 == parent_tidptr
+ r6 == newtls
+ r7 == child_tidptr */
+ "mr 3, %8\n\t"
+ "mr 5, %10\n\t"
+ "mr 6, %11\n\t"
+ "mr 7, %12\n\t"
+ "li 0, %4\n\t"
+ "sc\n\t"
+
+ /* Test if syscall was successful */
+ "cmpdi cr1, 3, 0\n\t"
+ "crandc cr1*4+eq, cr1*4+eq, cr0*4+so\n\t"
+ "bne- cr1, 1f\n\t"
+
+ /* Do the function call */
+ "std 2, %14(1)\n\t"
+#if _CALL_ELF != 2
+ "ld 0, 0(28)\n\t"
+ "ld 2, 8(28)\n\t"
+ "mtctr 0\n\t"
+#else
+ "mr 12, 28\n\t"
+ "mtctr 12\n\t"
+#endif
+ "mr 3, 27\n\t"
+ "bctrl\n\t"
+ "ld 2, %14(1)\n\t"
+
+ /* Call _exit(r3) */
+ "li 0, %5\n\t"
+ "sc\n\t"
+
+ /* Return to parent */
+ "1:\n\t"
+ "mr %0, 3\n\t"
+ : "=r" (__ret), "=r" (__err)
+ : "0" (-1), "i" (EINVAL),
+ "i" (__NR_clone), "i" (__NR_exit),
+ "r" (__fn), "r" (__cstack), "r" (__flags),
+ "r" (__arg), "r" (__ptidptr), "r" (__newtls),
+ "r" (__ctidptr), "i" (FRAME_MIN_SIZE), "i" (FRAME_TOC_SAVE)
+ : "cr0", "cr1", "memory", "ctr",
+ "r0", "r29",
"r27", "r28"); +#else register int (*__fn)(void *) __asm__ ("r8") = fn; register void *__cstack __asm__ ("r4") = child_stack; register int __flags __asm__ ("r3") = flags; @@ -2603,17 +2140,17 @@ struct kernel_io_event { /* check for fn == NULL * and child_stack == NULL */ - "cmp" LSS_SIZE_S "i cr0, %6, 0\n\t" - "cmp" LSS_SIZE_S "i cr1, %7, 0\n\t" + "cmpwi cr0, %6, 0\n\t" + "cmpwi cr1, %7, 0\n\t" "cror cr0*4+eq, cr1*4+eq, cr0*4+eq\n\t" "beq- cr0, 1f\n\t" /* set up stack frame for child */ - "clrr" LSS_SIZE_S "i %7, %7, 4\n\t" + "clrrwi %7, %7, 4\n\t" "li 0, 0\n\t" - "st" LSS_SIZE_S "u 0, %13(%7)\n\t" + "stwu 0, -16(%7)\n\t" - /* fn, arg, child_stack are saved across the syscall: r27-29 */ + /* fn, arg, child_stack are saved across the syscall: r28-30 */ "mr 28, %6\n\t" "mr 29, %7\n\t" "mr 27, %9\n\t" @@ -2629,304 +2166,403 @@ struct kernel_io_event { "sc\n\t" /* Test if syscall was successful */ - "cmp" LSS_SIZE_S "i cr1, 3, 0\n\t" + "cmpwi cr1, 3, 0\n\t" "crandc cr1*4+eq, cr1*4+eq, cr0*4+so\n\t" "bne- cr1, 1f\n\t" - /* Do the function call. On PowerPC64, a function pointer points - * a function descriptor instead of the first instruction. We need - * to load the callee's entry point and TOC from the descriptor. - * Since the callee may have a differet TOC, we also need to - * save and restore caller's TOC around the call. - */ - - #ifdef __PPC64__ - "std 2, 40(1)\n\t" /* Save caller's TOC. */ - "ld 4, 0(28)\n\t" /* Get callee's entry address. */ - "ld 2, 8(28)\n\t" /* Load calee's TOC. */ - "mtctr 4\n\t" + /* Do the function call */ + "mtctr 28\n\t" "mr 3, 27\n\t" "bctrl\n\t" - "ld 2, 40(1)\n\t" /* Restore caller's TOC after call. */ + + /* Call _exit(r3) */ + "li 0, %5\n\t" + "sc\n\t" + + /* Return to parent */ + "1:\n" + "mfcr %1\n\t" + "mr %0, 3\n\t" + : "=r" (__ret), "=r" (__err) + : "0" (-1), "1" (EINVAL), + "i" (__NR_clone), "i" (__NR_exit), + "r" (__fn), "r" (__cstack), "r" (__flags), + "r" (__arg), "r" (__ptidptr), "r" (__newtls), + "r" (__ctidptr) + : "cr0", "cr1", "memory", "ctr", + "r0", "r29", "r27", "r28"); + +#endif + } + LSS_RETURN(int, __ret, __err); + } + #elif defined(__aarch64__) + #undef LSS_REG + #define LSS_REG(r,a) register long __x##r __asm__("x"#r) = (long)a + #undef LSS_BODY + #define LSS_BODY(type,name,args...) 
\
+ register long __res_x0 __asm__("x0"); \
+ long __res; \
+ __asm__ __volatile__ ("mov x8, %1\n" \
+ "svc 0x0\n" \
+ : "=r"(__res_x0) \
+ : "i"(__NR_##name) , ## args \
+ : "memory"); \
+ __res = __res_x0; \
+ LSS_RETURN(type, __res)
+ #undef _syscall0
+ #define _syscall0(type, name) \
+ type LSS_NAME(name)(void) { \
+ LSS_BODY(type, name); \
+ }
+ #undef _syscall1
+ #define _syscall1(type, name, type1, arg1) \
+ type LSS_NAME(name)(type1 arg1) { \
+ LSS_REG(0, arg1); LSS_BODY(type, name, "r"(__x0)); \
+ }
+ #undef _syscall2
+ #define _syscall2_long(type, name, svc, type1, arg1, type2, arg2) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); \
+ LSS_BODY(type, svc, "r"(__x0), "r"(__x1)); \
+ }
+ #define _syscall2(type, name, type1, arg1, type2, arg2) \
+ _syscall2_long(type, name, name, type1, arg1, type2, arg2)
+ #undef _syscall3
+ #define _syscall3_long(type, name, svc, type1, arg1, type2, arg2, \
+ type3, arg3) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \
+ LSS_BODY(type, svc, "r"(__x0), "r"(__x1), "r"(__x2)); \
+ }
+ #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \
+ _syscall3_long(type, name, name, type1, arg1, type2, arg2, \
+ type3, arg3)
+ #undef _syscall4
+ #define _syscall4(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \
+ LSS_REG(3, arg4); \
+ LSS_BODY(type, name, "r"(__x0), "r"(__x1), "r"(__x2), "r"(__x3)); \
+ }
+ #undef _syscall5
+ #define _syscall5(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \
+ LSS_REG(3, arg4); LSS_REG(4, arg5); \
+ LSS_BODY(type, name, "r"(__x0), "r"(__x1), "r"(__x2), "r"(__x3), \
+ "r"(__x4)); \
+ }
+ #undef _syscall6
+ #define _syscall6_long(type,name,svc,type1,arg1,type2,arg2,type3,arg3, \
+ type4,arg4,type5,arg5,type6,arg6) \
+ type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, type4 arg4, \
+ type5 arg5, type6 arg6) { \
+ LSS_REG(0, arg1); LSS_REG(1, arg2); LSS_REG(2, arg3); \
+ LSS_REG(3, arg4); LSS_REG(4, arg5); LSS_REG(5, arg6); \
+ LSS_BODY(type, svc, "r"(__x0), "r"(__x1), "r"(__x2), "r"(__x3), \
+ "r"(__x4), "r"(__x5)); \
+ }
+ #define _syscall6(type,name,type1,arg1,type2,arg2,type3,arg3,type4,arg4, \
+ type5,arg5,type6,arg6) \
+ _syscall6_long(type,name,name,type1,arg1,type2,arg2,type3,arg3, \
+ type4,arg4,type5,arg5,type6,arg6)
+ /* clone function adapted from glibc 2.18 clone.S */
+ LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack,
+ int flags, void *arg, int *parent_tidptr,
+ void *newtls, int *child_tidptr) {
+ long __res;
+ {
+ register int (*__fn)(void *) __asm__("x0") = fn;
+ register void *__stack __asm__("x1") = child_stack;
+ register int __flags __asm__("x2") = flags;
+ register void *__arg __asm__("x3") = arg;
+ register int *__ptid __asm__("x4") = parent_tidptr;
+ register void *__tls __asm__("x5") = newtls;
+ register int *__ctid __asm__("x6") = child_tidptr;
+ __asm__ __volatile__(/* if (fn == NULL || child_stack == NULL)
+ * return -EINVAL;
+ */
+ "cbz x0,1f\n"
+ "cbz x1,1f\n"
+
+ /* Push "arg" and "fn" onto the stack that will be
+ * used by the child.
+ */ + "stp x0,x3, [x1, #-16]!\n" + + "mov x0,x2\n" /* flags */ + "mov x2,x4\n" /* ptid */ + "mov x3,x5\n" /* tls */ + "mov x4,x6\n" /* ctid */ + "mov x8,%9\n" /* clone */ + + "svc 0x0\n" + + /* if (%r0 != 0) + * return %r0; + */ + "cmp x0, #0\n" + "bne 2f\n" + + /* In the child, now. Call "fn(arg)". + */ + "ldp x1, x0, [sp], #16\n" + "blr x1\n" + + /* Call _exit(%r0). + */ + "mov x8, %10\n" + "svc 0x0\n" + "1:\n" + "mov x8, %1\n" + "2:\n" + : "=r" (__res) + : "i"(-EINVAL), + "r"(__fn), "r"(__stack), "r"(__flags), "r"(__arg), + "r"(__ptid), "r"(__tls), "r"(__ctid), + "i"(__NR_clone), "i"(__NR_exit) + : "x30", "memory"); + } + LSS_RETURN(int, __res); + } + #elif defined(__s390__) + #undef LSS_REG + #define LSS_REG(r, a) register unsigned long __r##r __asm__("r"#r) = (unsigned long) a + #undef LSS_BODY + #define LSS_BODY(type, name, args...) \ + register unsigned long __nr __asm__("r1") \ + = (unsigned long)(__NR_##name); \ + register long __res_r2 __asm__("r2"); \ + long __res; \ + __asm__ __volatile__ \ + ("svc 0\n\t" \ + : "=d"(__res_r2) \ + : "d"(__nr), ## args \ + : "memory"); \ + __res = __res_r2; \ + LSS_RETURN(type, __res) + #undef _syscall0 + #define _syscall0(type, name) \ + type LSS_NAME(name)(void) { \ + LSS_BODY(type, name); \ + } + #undef _syscall1 + #define _syscall1(type, name, type1, arg1) \ + type LSS_NAME(name)(type1 arg1) { \ + LSS_REG(2, arg1); \ + LSS_BODY(type, name, "0"(__r2)); \ + } + #undef _syscall2 + #define _syscall2(type, name, type1, arg1, type2, arg2) \ + type LSS_NAME(name)(type1 arg1, type2 arg2) { \ + LSS_REG(2, arg1); LSS_REG(3, arg2); \ + LSS_BODY(type, name, "0"(__r2), "d"(__r3)); \ + } + #undef _syscall3 + #define _syscall3(type, name, type1, arg1, type2, arg2, type3, arg3) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3) { \ + LSS_REG(2, arg1); LSS_REG(3, arg2); LSS_REG(4, arg3); \ + LSS_BODY(type, name, "0"(__r2), "d"(__r3), "d"(__r4)); \ + } + #undef _syscall4 + #define _syscall4(type, name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, \ + type4 arg4) { \ + LSS_REG(2, arg1); LSS_REG(3, arg2); LSS_REG(4, arg3); \ + LSS_REG(5, arg4); \ + LSS_BODY(type, name, "0"(__r2), "d"(__r3), "d"(__r4), \ + "d"(__r5)); \ + } + #undef _syscall5 + #define _syscall5(type, name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, \ + type4 arg4, type5 arg5) { \ + LSS_REG(2, arg1); LSS_REG(3, arg2); LSS_REG(4, arg3); \ + LSS_REG(5, arg4); LSS_REG(6, arg5); \ + LSS_BODY(type, name, "0"(__r2), "d"(__r3), "d"(__r4), \ + "d"(__r5), "d"(__r6)); \ + } + #undef _syscall6 + #define _syscall6(type, name, type1, arg1, type2, arg2, type3, arg3, \ + type4, arg4, type5, arg5, type6, arg6) \ + type LSS_NAME(name)(type1 arg1, type2 arg2, type3 arg3, \ + type4 arg4, type5 arg5, type6 arg6) { \ + LSS_REG(2, arg1); LSS_REG(3, arg2); LSS_REG(4, arg3); \ + LSS_REG(5, arg4); LSS_REG(6, arg5); LSS_REG(7, arg6); \ + LSS_BODY(type, name, "0"(__r2), "d"(__r3), "d"(__r4), \ + "d"(__r5), "d"(__r6), "d"(__r7)); \ + } + LSS_INLINE int LSS_NAME(clone)(int (*fn)(void *), void *child_stack, + int flags, void *arg, int *parent_tidptr, + void *newtls, int *child_tidptr) { + long __ret; + { + register int (*__fn)(void *) __asm__ ("r1") = fn; + register void *__cstack __asm__ ("r2") = child_stack; + register int __flags __asm__ ("r3") = flags; + register void *__arg __asm__ ("r0") = arg; + register int *__ptidptr __asm__ ("r4") = parent_tidptr; + 
register void *__newtls __asm__ ("r6") = newtls; + register int *__ctidptr __asm__ ("r5") = child_tidptr; + __asm__ __volatile__ ( + #ifndef __s390x__ + /* arg already in r0 */ + "ltr %4, %4\n\t" /* check fn, which is already in r1 */ + "jz 1f\n\t" /* NULL function pointer, return -EINVAL */ + "ltr %5, %5\n\t" /* check child_stack, which is already in r2 */ + "jz 1f\n\t" /* NULL stack pointer, return -EINVAL */ + /* flags already in r3 */ + /* parent_tidptr already in r4 */ + /* child_tidptr already in r5 */ + /* newtls already in r6 */ + "svc %2\n\t" /* invoke clone syscall */ + "ltr %0,%%r2\n\t" /* load return code into __ret and test */ + "jnz 1f\n\t" /* return to parent if non-zero */ + /* start child thread */ + "lr %%r2, %7\n\t" /* set first parameter to void *arg */ + "ahi %%r15, -96\n\t" /* make room on the stack for the save area */ + "xc 0(4,%%r15), 0(%%r15)\n\t" + "basr %%r14, %4\n\t" /* jump to fn */ + "svc %3\n" /* invoke exit syscall */ + "1:\n" #else - "mtctr 28\n\t" - "mr 3, 27\n\t" - "bctrl\n\t" + /* arg already in r0 */ + "ltgr %4, %4\n\t" /* check fn, which is already in r1 */ + "jz 1f\n\t" /* NULL function pointer, return -EINVAL */ + "ltgr %5, %5\n\t" /* check child_stack, which is already in r2 */ + "jz 1f\n\t" /* NULL stack pointer, return -EINVAL */ + /* flags already in r3 */ + /* parent_tidptr already in r4 */ + /* child_tidptr already in r5 */ + /* newtls already in r6 */ + "svc %2\n\t" /* invoke clone syscall */ + "ltgr %0, %%r2\n\t" /* load return code into __ret and test */ + "jnz 1f\n\t" /* return to parent if non-zero */ + /* start child thread */ + "lgr %%r2, %7\n\t" /* set first parameter to void *arg */ + "aghi %%r15, -160\n\t" /* make room on the stack for the save area */ + "xc 0(8,%%r15), 0(%%r15)\n\t" + "basr %%r14, %4\n\t" /* jump to fn */ + "svc %3\n" /* invoke exit syscall */ + "1:\n" #endif - - /* Call _exit(r3) */ - "li 0, %5\n\t" - "sc\n\t" - - /* Return to parent */ - "1:\n" - "mfcr %1\n\t" - "mr %0, 3\n\t" - : "=r" (__ret), "=r" (__err) - : "0" (-1), "1" (EINVAL), - "i" (__NR_clone), "i" (__NR_exit), - "r" (__fn), "r" (__cstack), "r" (__flags), - "r" (__arg), "r" (__ptidptr), "r" (__newtls), - "r" (__ctidptr), "i"(-LSS_PPC_MINIMUM_FRAME_SIZE) - : "cr0", "cr1", "memory", "ctr", - "r0", "r29", "r27", "r28"); + : "=r" (__ret) + : "0" (-EINVAL), "i" (__NR_clone), "i" (__NR_exit), + "d" (__fn), "d" (__cstack), "d" (__flags), "d" (__arg), + "d" (__ptidptr), "d" (__newtls), "d" (__ctidptr) + : "cc", "r14", "memory" + ); } - LSS_RETURN(int, __ret, __err); + LSS_RETURN(int, __ret); } - #ifdef __PPC64__ - LSS_INLINE _syscall4(int, fadvise64, int, fd, loff_t, offset, loff_t, len, - int, advice) - #else - /* fadvise64 wrapper not yet implemented for 32-bit PowerPC. 
*/ #endif - #endif /* defined (__PPC__) */ #define __NR__exit __NR_exit #define __NR__gettid __NR_gettid #define __NR__mremap __NR_mremap - LSS_INLINE _syscall1(int, brk, void *, e) - LSS_INLINE _syscall2(int, capset, - struct kernel_cap_user_header*, h, - struct kernel_cap_user_data*, d) - LSS_INLINE _syscall1(int, chdir, const char *,p) - LSS_INLINE _syscall1(int, chroot, const char *,p) LSS_INLINE _syscall1(int, close, int, f) - LSS_INLINE _syscall2(int, clock_getres, int, c, - struct kernel_timespec*, t) - LSS_INLINE _syscall2(int, clock_gettime, int, c, - struct kernel_timespec*, t) - LSS_INLINE _syscall1(int, dup, int, f) - LSS_INLINE _syscall2(int, dup2, int, s, - int, d) - LSS_INLINE _syscall3(int, execve, const char*, f, - const char*const*,a,const char*const*, e) LSS_INLINE _syscall1(int, _exit, int, e) - LSS_INLINE _syscall1(int, exit_group, int, e) +#if defined(__aarch64__) && defined (__ILP32__) + /* aarch64_ilp32 uses fcntl64 for sys_fcntl() */ + LSS_INLINE _syscall3_long(int, fcntl, fcntl64, int, f, + int, c, long, a) +#else LSS_INLINE _syscall3(int, fcntl, int, f, int, c, long, a) - LSS_INLINE _syscall0(pid_t, fork) +#endif +#if defined(__aarch64__) && defined (__ILP32__) + /* aarch64_ilp32 uses fstat64 for sys_fstat() */ + LSS_INLINE _syscall2_long(int, fstat, fstat64, int, f, + struct kernel_stat*, b) +#else LSS_INLINE _syscall2(int, fstat, int, f, struct kernel_stat*, b) - LSS_INLINE _syscall2(int, fstatfs, int, f, - struct kernel_statfs*, b) - LSS_INLINE _syscall2(int, ftruncate, int, f, - off_t, l) - LSS_INLINE _syscall4(int, futex, int*, a, +#endif + LSS_INLINE _syscall6(int, futex, int*, a, int, o, int, v, - struct kernel_timespec*, t) - LSS_INLINE _syscall3(int, getdents, int, f, - struct kernel_dirent*, d, int, c) - LSS_INLINE _syscall3(int, getdents64, int, f, - struct kernel_dirent64*, d, int, c) - LSS_INLINE _syscall0(gid_t, getegid) - LSS_INLINE _syscall0(uid_t, geteuid) - LSS_INLINE _syscall0(pid_t, getpgrp) + struct kernel_timespec*, t, + int*, a2, + int, v3) +#ifdef __NR_getdents64 + LSS_INLINE _syscall3(int, getdents64, int, f, + struct kernel_dirent64*, d, int, c) +#define KERNEL_DIRENT kernel_dirent64 +#define GETDENTS sys_getdents64 +#else + LSS_INLINE _syscall3(int, getdents, int, f, + struct kernel_dirent*, d, int, c) +#define KERNEL_DIRENT kernel_dirent +#define GETDENTS sys_getdents +#endif LSS_INLINE _syscall0(pid_t, getpid) LSS_INLINE _syscall0(pid_t, getppid) - LSS_INLINE _syscall2(int, getpriority, int, a, - int, b) - LSS_INLINE _syscall3(int, getresgid, gid_t *, r, - gid_t *, e, gid_t *, s) - LSS_INLINE _syscall3(int, getresuid, uid_t *, r, - uid_t *, e, uid_t *, s) - #ifndef __ARM_EABI__ - /* No available on ARM EABI Linux. 
*/ - LSS_INLINE _syscall2(int, getrlimit, int, r, - struct kernel_rlimit*, l) - #endif - LSS_INLINE _syscall1(pid_t, getsid, pid_t, p) LSS_INLINE _syscall0(pid_t, _gettid) - LSS_INLINE _syscall2(int, gettimeofday, struct timeval *, v, - struct timezone *, z) - LSS_INLINE _syscall5(int, setxattr, const char *,p, - const char *, n, const void *,v, - size_t, s, int, f) - LSS_INLINE _syscall5(int, lsetxattr, const char *,p, - const char *, n, const void *,v, - size_t, s, int, f) - LSS_INLINE _syscall4(ssize_t, getxattr, const char *,p, - const char *, n, void *, v, size_t, s) - LSS_INLINE _syscall4(ssize_t, lgetxattr, const char *,p, - const char *, n, void *, v, size_t, s) - LSS_INLINE _syscall3(ssize_t, listxattr, const char *,p, - char *, l, size_t, s) - LSS_INLINE _syscall3(ssize_t, llistxattr, const char *,p, - char *, l, size_t, s) - LSS_INLINE _syscall3(int, ioctl, int, d, - int, r, void *, a) - LSS_INLINE _syscall2(int, ioprio_get, int, which, - int, who) - LSS_INLINE _syscall3(int, ioprio_set, int, which, - int, who, int, ioprio) LSS_INLINE _syscall2(int, kill, pid_t, p, int, s) - LSS_INLINE _syscall3(off_t, lseek, int, f, - off_t, o, int, w) + #if defined(__x86_64__) + /* Need to make sure off_t isn't truncated to 32-bits under x32. */ + LSS_INLINE off_t LSS_NAME(lseek)(int f, off_t o, int w) { + _LSS_BODY(3, off_t, lseek, off_t, LSS_SYSCALL_ARG(f), (uint64_t)(o), + LSS_SYSCALL_ARG(w)); + } + #elif defined(__aarch64__) && defined (__ILP32__) + /* aarch64_ilp32 uses llseek for sys_lseek() */ + LSS_INLINE _syscall3_long(off_t, lseek, llseek, int, f, + off_t, o, int, w) + #else + LSS_INLINE _syscall3(off_t, lseek, int, f, + off_t, o, int, w) + #endif LSS_INLINE _syscall2(int, munmap, void*, s, size_t, l) - LSS_INLINE _syscall6(long, move_pages, pid_t, p, - unsigned long, n, void **,g, int *, d, - int *, s, int, f) - LSS_INLINE _syscall3(int, mprotect, const void *,a, - size_t, l, int, p) LSS_INLINE _syscall5(void*, _mremap, void*, o, size_t, os, size_t, ns, unsigned long, f, void *, a) - LSS_INLINE _syscall3(int, open, const char*, p, - int, f, int, m) - LSS_INLINE _syscall3(int, poll, struct kernel_pollfd*, u, - unsigned int, n, int, t) LSS_INLINE _syscall2(int, prctl, int, o, long, a) - LSS_INLINE _syscall5(int, mount, const char *, source, const char *, target, - const char *, filesystemtype, unsigned long, mountflags, - const void *, data) - LSS_INLINE _syscall1(int, unshare, int, flags) - LSS_INLINE _syscall2(int, setns, int, fd, int, nstype) - #if defined(__NR_preadv) - // Defined on x86_64 / i386 only - LSS_INLINE _syscall5(ssize_t, preadv, unsigned long, fd, - const struct kernel_iovec*, iovec, - unsigned long, vlen, unsigned long, pos_l, - unsigned long, pos_h) - #endif LSS_INLINE _syscall4(long, ptrace, int, r, pid_t, p, void *, a, void *, d) - #if defined(__NR_pwritev) - // Defined on x86_64 / i386 only - LSS_INLINE _syscall5(ssize_t, pwritev, unsigned long, fd, - const struct kernel_iovec*, iovec, - unsigned long, vlen, unsigned long, pos_l, - unsigned long, pos_h) - #endif - #if defined(__NR_quotactl) - // Defined on x86_64 / i386 only - LSS_INLINE _syscall4(int, quotactl, int, cmd, const char *, special, - int, id, caddr_t, addr) - #endif LSS_INLINE _syscall3(ssize_t, read, int, f, void *, b, size_t, c) - LSS_INLINE _syscall3(int, readlink, const char*, p, - char*, b, size_t, s) LSS_INLINE _syscall4(int, rt_sigaction, int, s, const struct kernel_sigaction*, a, struct kernel_sigaction*, o, size_t, c) - LSS_INLINE _syscall2(int, rt_sigpending, struct kernel_sigset_t *, s, - 
size_t, c) LSS_INLINE _syscall4(int, rt_sigprocmask, int, h, const struct kernel_sigset_t*, s, struct kernel_sigset_t*, o, size_t, c); - LSS_INLINE _syscall1(int, rt_sigreturn, unsigned long, u); - LSS_INLINE _syscall2(int, rt_sigsuspend, - const struct kernel_sigset_t*, s, size_t, c); - LSS_INLINE _syscall3(int, sched_getaffinity,pid_t, p, - unsigned int, l, unsigned long *, m) - LSS_INLINE _syscall3(int, sched_setaffinity,pid_t, p, - unsigned int, l, unsigned long *, m) LSS_INLINE _syscall0(int, sched_yield) - LSS_INLINE _syscall1(long, set_tid_address, int *, t) - LSS_INLINE _syscall1(int, setfsgid, gid_t, g) - LSS_INLINE _syscall1(int, setfsuid, uid_t, u) - LSS_INLINE _syscall1(int, setuid, uid_t, u) - LSS_INLINE _syscall1(int, setgid, gid_t, g) - LSS_INLINE _syscall2(int, setpgid, pid_t, p, - pid_t, g) - LSS_INLINE _syscall3(int, setpriority, int, a, - int, b, int, p) - LSS_INLINE _syscall3(int, setresgid, gid_t, r, - gid_t, e, gid_t, s) - LSS_INLINE _syscall3(int, setresuid, uid_t, r, - uid_t, e, uid_t, s) - LSS_INLINE _syscall2(int, setrlimit, int, r, - const struct kernel_rlimit*, l) - LSS_INLINE _syscall0(pid_t, setsid) LSS_INLINE _syscall2(int, sigaltstack, const stack_t*, s, const stack_t*, o) - #if defined(__NR_sigreturn) - LSS_INLINE _syscall1(int, sigreturn, unsigned long, u); + #if defined(__NR_fstatat) + LSS_INLINE _syscall4(int, fstatat, int, d, const char *, p, + struct kernel_stat*, b, int, flags) + LSS_INLINE int LSS_NAME(stat)(const char* p, struct kernel_stat* b) { + return LSS_NAME(fstatat)(AT_FDCWD,p,b,0); + } + #else + LSS_INLINE _syscall2(int, stat, const char*, f, + struct kernel_stat*, b) #endif - LSS_INLINE _syscall2(int, stat, const char*, f, - struct kernel_stat*, b) - LSS_INLINE _syscall2(int, statfs, const char*, f, - struct kernel_statfs*, b) - LSS_INLINE _syscall3(int, tgkill, pid_t, p, - pid_t, t, int, s) - LSS_INLINE _syscall2(int, tkill, pid_t, p, - int, s) LSS_INLINE _syscall3(ssize_t, write, int, f, const void *, b, size_t, c) - LSS_INLINE _syscall3(ssize_t, writev, int, f, - const struct kernel_iovec*, v, size_t, c) - LSS_INLINE _syscall1(int, umask, unsigned, m) - LSS_INLINE _syscall1(int, unlink, const char*, f) #if defined(__NR_getcpu) LSS_INLINE _syscall3(long, getcpu, unsigned *, cpu, unsigned *, node, void *, unused); #endif - #if defined(__x86_64__) || \ + #if defined(__x86_64__) || defined(__aarch64__) || \ (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32) - LSS_INLINE _syscall3(int, recvmsg, int, s, - struct kernel_msghdr*, m, int, f) - LSS_INLINE _syscall3(int, sendmsg, int, s, - const struct kernel_msghdr*, m, int, f) - LSS_INLINE _syscall6(int, sendto, int, s, - const void*, m, size_t, l, - int, f, - const struct kernel_sockaddr*, a, int, t) - LSS_INLINE _syscall2(int, shutdown, int, s, - int, h) LSS_INLINE _syscall3(int, socket, int, d, int, t, int, p) - LSS_INLINE _syscall4(int, socketpair, int, d, - int, t, int, p, int*, s) #endif - - #if defined(__x86_64__) || defined(__PPC__) - LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid, - gid_t *egid, - gid_t *sgid) { - return LSS_NAME(getresgid)(rgid, egid, sgid); - } - - LSS_INLINE int LSS_NAME(getresuid32)(uid_t *ruid, - uid_t *euid, - uid_t *suid) { - return LSS_NAME(getresuid)(ruid, euid, suid); - } - - LSS_INLINE _syscall4(int, newfstatat, int, d, - const char *, p, - struct kernel_stat*, b, int, f) - - LSS_INLINE int LSS_NAME(setfsgid32)(gid_t gid) { - return LSS_NAME(setfsgid)(gid); - } - - LSS_INLINE int LSS_NAME(setfsuid32)(uid_t uid) { - return LSS_NAME(setfsuid)(uid); - } - - 
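The stat()-to-fstatat() mapping introduced in this hunk follows the usual AT_FDCWD convention: stat(path, buf) is synthesized as fstatat(AT_FDCWD, path, buf, 0) on targets (such as aarch64) that only provide __NR_fstatat. A minimal sketch of that equivalence, written against the ordinary libc API rather than the LSS_NAME wrappers, with a placeholder path:

/* Sketch only: standard POSIX fstatat(), not the LSS wrappers. */
#include <fcntl.h>     /* AT_FDCWD */
#include <sys/stat.h>
#include <stdio.h>

int main(void) {
    struct stat a, b;
    /* Both calls describe the same file; AT_FDCWD makes the path
     * resolve relative to the current working directory, exactly
     * like plain stat(). */
    if (stat("/etc/hostname", &a) == 0 &&
        fstatat(AT_FDCWD, "/etc/hostname", &b, 0) == 0) {
        printf("same inode: %d\n", (int)(a.st_ino == b.st_ino));
    }
    return 0;
}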
LSS_INLINE int LSS_NAME(setresgid32)(gid_t rgid, gid_t egid, gid_t sgid) { - return LSS_NAME(setresgid)(rgid, egid, sgid); - } - - LSS_INLINE int LSS_NAME(setresuid32)(uid_t ruid, uid_t euid, uid_t suid) { - return LSS_NAME(setresuid)(ruid, euid, suid); - } - #endif // defined(__x86_64__) || defined(__PPC__) - - #if defined(__x86_64__) || defined(__PPC64__) - LSS_INLINE _syscall4(int, fallocate, int, fd, int, mode, - loff_t, offset, loff_t, len) - - LSS_INLINE _syscall6(void*, mmap, void*, s, - size_t, l, int, p, - int, f, int, d, - __off64_t, o) - + #if defined(__x86_64__) || defined(__s390x__) LSS_INLINE int LSS_NAME(sigaction)(int signum, const struct kernel_sigaction *act, struct kernel_sigaction *oldact) { - #if defined(__x86_64__) + #if defined(__x86_64__) /* On x86_64, the kernel requires us to always set our own * SA_RESTORER in order to be able to return from a signal handler. * This function must have a "magic" signature that the "gdb" @@ -2938,17 +2574,10 @@ struct kernel_io_event { a.sa_restorer = LSS_NAME(restore_rt)(); return LSS_NAME(rt_sigaction)(signum, &a, oldact, (KERNEL_NSIG+7)/8); - } else { + } else + #endif return LSS_NAME(rt_sigaction)(signum, act, oldact, (KERNEL_NSIG+7)/8); - } - #else - return LSS_NAME(rt_sigaction)(signum, act, oldact, (KERNEL_NSIG+7)/8); - #endif - } - - LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) { - return LSS_NAME(rt_sigpending)(set, (KERNEL_NSIG+7)/8); } LSS_INLINE int LSS_NAME(sigprocmask)(int how, @@ -2956,178 +2585,41 @@ struct kernel_io_event { struct kernel_sigset_t *oldset) { return LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8); } + #endif + #if (defined(__aarch64__)) || \ + (defined(__mips__) \ + && (_MIPS_SIM == _MIPS_SIM_ABI64 || _MIPS_SIM == _MIPS_SIM_NABI32)) + LSS_INLINE int LSS_NAME(sigaction)(int signum, + const struct kernel_sigaction *act, + struct kernel_sigaction *oldact) { + return LSS_NAME(rt_sigaction)(signum, act, oldact, (KERNEL_NSIG+7)/8); - LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) { - return LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8); } - #endif /* defined(__x86_64__) || defined(__PPC64__) */ - - #if defined(__x86_64__) || \ - defined(__arm__) || \ - (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI32) + LSS_INLINE int LSS_NAME(sigprocmask)(int how, + const struct kernel_sigset_t *set, + struct kernel_sigset_t *oldset) { + return LSS_NAME(rt_sigprocmask)(how, set, oldset, (KERNEL_NSIG+7)/8); + } + #endif + #ifdef __NR_wait4 LSS_INLINE _syscall4(pid_t, wait4, pid_t, p, int*, s, int, o, - struct kernel_rusage*, r) - + struct kernel_rusage*, r) LSS_INLINE pid_t LSS_NAME(waitpid)(pid_t pid, int *status, int options){ return LSS_NAME(wait4)(pid, status, options, 0); } + #else + LSS_INLINE _syscall3(pid_t, waitpid, pid_t, p, + int*, s, int, o) #endif - #if defined(__x86_64__)|| \ - defined(__ARM_ARCH_3__) || defined(__ARM_ARCH_5T__) || \ - defined(__mips__) || defined(__PPC__) - LSS_INLINE _syscall2(int, setgroups, size_t, c, - const gid_t *, g) - #endif - #if defined(__i386__) || defined(__x86_64__) || defined(__arm__) || \ - defined(__PPC__) + #ifdef __NR_openat LSS_INLINE _syscall4(int, openat, int, d, const char *, p, int, f, int, m) - LSS_INLINE _syscall3(int, unlinkat, int, d, const char *, p, int, f) - #endif - #if defined(__i386__) || defined(__arm__) - #define __NR__getresgid32 __NR_getresgid32 - #define __NR__getresuid32 __NR_getresuid32 - #define __NR__setfsgid32 __NR_setfsgid32 - #define __NR__setfsuid32 __NR_setfsuid32 - #define 
__NR__setgroups32 __NR_setgroups32 - #define __NR__setgroups __NR_setgroups - #define __NR__setresgid32 __NR_setresgid32 - #define __NR__setresuid32 __NR_setresuid32 - LSS_INLINE _syscall2(int, ugetrlimit, int, r, - struct kernel_rlimit*, l) - LSS_INLINE _syscall3(int, _getresgid32, gid_t *, r, - gid_t *, e, gid_t *, s) - LSS_INLINE _syscall3(int, _getresuid32, uid_t *, r, - uid_t *, e, uid_t *, s) - LSS_INLINE _syscall1(int, _setfsgid32, gid_t, f) - LSS_INLINE _syscall1(int, _setfsuid32, uid_t, f) - LSS_INLINE _syscall2(int, _setgroups32, int, s, - const unsigned int *, l) - LSS_INLINE _syscall2(int, _setgroups, size_t, c, - const unsigned short *, g) - LSS_INLINE _syscall3(int, _setresgid32, gid_t, r, - gid_t, e, gid_t, s) - LSS_INLINE _syscall3(int, _setresuid32, uid_t, r, - uid_t, e, uid_t, s) - - LSS_INLINE int LSS_NAME(getresgid32)(gid_t *rgid, - gid_t *egid, - gid_t *sgid) { - int rc; - if ((rc = LSS_NAME(_getresgid32)(rgid, egid, sgid)) < 0 && - LSS_ERRNO == ENOSYS) { - if ((rgid == NULL) || (egid == NULL) || (sgid == NULL)) { - return EFAULT; - } - // Clear the high bits first, since getresgid only sets 16 bits - *rgid = *egid = *sgid = 0; - rc = LSS_NAME(getresgid)(rgid, egid, sgid); - } - return rc; - } - - LSS_INLINE int LSS_NAME(getresuid32)(uid_t *ruid, - uid_t *euid, - uid_t *suid) { - int rc; - if ((rc = LSS_NAME(_getresuid32)(ruid, euid, suid)) < 0 && - LSS_ERRNO == ENOSYS) { - if ((ruid == NULL) || (euid == NULL) || (suid == NULL)) { - return EFAULT; - } - // Clear the high bits first, since getresuid only sets 16 bits - *ruid = *euid = *suid = 0; - rc = LSS_NAME(getresuid)(ruid, euid, suid); - } - return rc; - } - - LSS_INLINE int LSS_NAME(setfsgid32)(gid_t gid) { - int rc; - if ((rc = LSS_NAME(_setfsgid32)(gid)) < 0 && - LSS_ERRNO == ENOSYS) { - if ((unsigned int)gid & ~0xFFFFu) { - LSS_ERRNO = EINVAL; - } else { - rc = LSS_NAME(setfsgid)(gid); - } - } - return rc; - } - - LSS_INLINE int LSS_NAME(setfsuid32)(uid_t uid) { - int rc; - if ((rc = LSS_NAME(_setfsuid32)(uid)) < 0 && - LSS_ERRNO == ENOSYS) { - if ((unsigned int)uid & ~0xFFFFu) { - LSS_ERRNO = EINVAL; - } else { - rc = LSS_NAME(setfsuid)(uid); - } - } - return rc; - } - - - // We cannot allocate memory so there is a problem with building the - // list of groups with the proper datatype. Older kernels have limits - // on the number of groups that can be set at one time of up to 32. - // So we have an array on the stack of size 32 where to put the groups. 
- #define LSS_SET_GROUPS_SIZE 32 - LSS_INLINE int LSS_NAME(setgroups)(size_t size, const unsigned int *list) { - int rc = 0; - if ((rc = LSS_NAME(_setgroups32)(size, list)) < 0 && - LSS_ERRNO == ENOSYS) { - if (size > LSS_SET_GROUPS_SIZE) { - LSS_ERRNO = EINVAL; - } else { - unsigned short gid_list[LSS_SET_GROUPS_SIZE]; - int i; - for (i = 0; i < size; ++i) { - if (list[i] & ~0xFFFFu) { - LSS_ERRNO = EINVAL; - break; - } - gid_list[i] = list[i]; - } - if (LSS_ERRNO != EINVAL) { - rc = LSS_NAME(_setgroups)(size, gid_list); - } - } - } - return rc; - } - #undef LSS_SET_GROUPS_SIZE - - LSS_INLINE int LSS_NAME(setresgid32)(gid_t rgid, gid_t egid, gid_t sgid) { - int rc; - if ((rc = LSS_NAME(_setresgid32)(rgid, egid, sgid)) < 0 && - LSS_ERRNO == ENOSYS) { - if ((unsigned int)rgid & ~0xFFFFu || - (unsigned int)egid & ~0xFFFFu || - (unsigned int)sgid & ~0xFFFFu) { - LSS_ERRNO = EINVAL; - } else { - rc = LSS_NAME(setresgid)(rgid, egid, sgid); - } - } - return rc; - } - - LSS_INLINE int LSS_NAME(setresuid32)(uid_t ruid, uid_t euid, uid_t suid) { - int rc; - if ((rc = LSS_NAME(_setresuid32)(ruid, euid, suid)) < 0 && - LSS_ERRNO == ENOSYS) { - if ((unsigned int)ruid & ~0xFFFFu || - (unsigned int)euid & ~0xFFFFu || - (unsigned int)suid & ~0xFFFFu) { - LSS_ERRNO = EINVAL; - } else { - rc = LSS_NAME(setresuid)(ruid, euid, suid); - } - } - return rc; + LSS_INLINE int LSS_NAME(open)(const char* p, int f, int m) { + return LSS_NAME(openat)(AT_FDCWD,p,f,m ); } + #else + LSS_INLINE _syscall3(int, open, const char*, p, + int, f, int, m) #endif LSS_INLINE int LSS_NAME(sigemptyset)(struct kernel_sigset_t *set) { memset(&set->sig, 0, sizeof(set->sig)); @@ -3163,51 +2655,40 @@ struct kernel_io_event { } } - LSS_INLINE int LSS_NAME(sigismember)(struct kernel_sigset_t *set, - int signum) { - if (signum < 1 || signum > (int)(8*sizeof(set->sig))) { - LSS_ERRNO = EINVAL; - return -1; - } else { - return !!(set->sig[(signum - 1)/(8*sizeof(set->sig[0]))] & - (1UL << ((signum - 1) % (8*sizeof(set->sig[0]))))); - } - } - #if defined(__i386__) || \ - defined(__arm__) || \ - (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) || \ - (defined(__PPC__) && !defined(__PPC64__)) + #if defined(__i386__) || \ + defined(__arm__) || \ + (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) || \ + defined(__PPC__) || \ + (defined(__s390__) && !defined(__s390x__)) #define __NR__sigaction __NR_sigaction - #define __NR__sigpending __NR_sigpending #define __NR__sigprocmask __NR_sigprocmask - #define __NR__sigsuspend __NR_sigsuspend LSS_INLINE _syscall2(int, fstat64, int, f, struct kernel_stat64 *, b) LSS_INLINE _syscall5(int, _llseek, uint, fd, ulong, hi, ulong, lo, loff_t *, res, uint, wh) - - #ifndef __ARM_EABI__ - /* Not available on ARM EABI Linux. */ - LSS_INLINE _syscall1(void*, mmap, void*, a) - #endif - LSS_INLINE _syscall6(void*, mmap2, void*, s, +#if defined(__s390__) && !defined(__s390x__) + /* On s390, mmap2() arguments are passed in memory. 
*/ + LSS_INLINE void* LSS_NAME(_mmap2)(void *s, size_t l, int p, int f, int d, + off_t o) { + unsigned long buf[6] = { (unsigned long) s, (unsigned long) l, + (unsigned long) p, (unsigned long) f, + (unsigned long) d, (unsigned long) o }; + LSS_REG(2, buf); + LSS_BODY(void*, mmap2, "0"(__r2)); + } +#elif !defined(__PPC64__) + #define __NR__mmap2 __NR_mmap2 + LSS_INLINE _syscall6(void*, _mmap2, void*, s, size_t, l, int, p, int, f, int, d, off_t, o) +#endif LSS_INLINE _syscall3(int, _sigaction, int, s, const struct kernel_old_sigaction*, a, struct kernel_old_sigaction*, o) - LSS_INLINE _syscall1(int, _sigpending, unsigned long*, s) LSS_INLINE _syscall3(int, _sigprocmask, int, h, const unsigned long*, s, unsigned long*, o) - #ifdef __PPC__ - LSS_INLINE _syscall1(int, _sigsuspend, unsigned long, s) - #else - LSS_INLINE _syscall3(int, _sigsuspend, const void*, a, - int, b, - unsigned long, s) - #endif LSS_INLINE _syscall2(int, stat64, const char *, p, struct kernel_stat64 *, b) @@ -3273,17 +2754,6 @@ struct kernel_io_event { return rc; } - LSS_INLINE int LSS_NAME(sigpending)(struct kernel_sigset_t *set) { - int old_errno = LSS_ERRNO; - int rc = LSS_NAME(rt_sigpending)(set, (KERNEL_NSIG+7)/8); - if (rc < 0 && LSS_ERRNO == ENOSYS) { - LSS_ERRNO = old_errno; - LSS_NAME(sigemptyset)(set); - rc = LSS_NAME(_sigpending)(&set->sig[0]); - } - return rc; - } - LSS_INLINE int LSS_NAME(sigprocmask)(int how, const struct kernel_sigset_t *set, struct kernel_sigset_t *oldset) { @@ -3300,161 +2770,59 @@ struct kernel_io_event { } return rc; } - - LSS_INLINE int LSS_NAME(sigsuspend)(const struct kernel_sigset_t *set) { - int olderrno = LSS_ERRNO; - int rc = LSS_NAME(rt_sigsuspend)(set, (KERNEL_NSIG+7)/8); - if (rc < 0 && LSS_ERRNO == ENOSYS) { - LSS_ERRNO = olderrno; - rc = LSS_NAME(_sigsuspend)( - #ifndef __PPC__ - set, 0, - #endif - set->sig[0]); - } - return rc; - } #endif - #if defined(__PPC__) - #undef LSS_SC_LOADARGS_0 - #define LSS_SC_LOADARGS_0(dummy...) - /* arg1 .. arg6 are passed in an unsigned long array pointed by r4. */ - #undef LSS_SC_LOADARGS_1 - #define LSS_SC_LOADARGS_1(arg1) \ - sc_args[0] = (unsigned long) (arg1) - #undef LSS_SC_LOADARGS_2 - #define LSS_SC_LOADARGS_2(arg1, arg2) \ - LSS_SC_LOADARGS_1(arg1); \ - sc_args[1] = (unsigned long) (arg2) - #undef LSS_SC_LOADARGS_3 - #define LSS_SC_LOADARGS_3(arg1, arg2, arg3) \ - LSS_SC_LOADARGS_2(arg1, arg2); \ - sc_args[2] = (unsigned long) (arg3) - #undef LSS_SC_LOADARGS_4 - #define LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4) \ - LSS_SC_LOADARGS_3(arg1, arg2, arg3); \ - sc_args[3] = (unsigned long) (arg4) - #undef LSS_SC_LOADARGS_5 - #define LSS_SC_LOADARGS_5(arg1, arg2, arg3, arg4, arg5) \ - LSS_SC_LOADARGS_4(arg1, arg2, arg3, arg4); \ - sc_args[4] = (unsigned long) (arg5) - #undef LSS_SC_LOADARGS_6 - #define LSS_SC_LOADARGS_6(arg1, arg2, arg3, arg4, arg5, arg6) \ - LSS_SC_LOADARGS_5(arg1, arg2, arg3, arg4, arg5); \ - sc_args[5] = (unsigned long) (arg6) - #undef LSS_SC_BODY - /* - * Do a socket system call using the generic socketcall() interface. - * We pack arguments into an array of unsigned longs and then - * call socketcall() with a function number and the argument array. - * Although some socket calls now have their own syscall numbers, - * we still use socketcall() to make our code work with older kernels. - */ - #define LSS_SC_BODY(nr, type, opt, args...) 
\ - long __sc_ret, __sc_err; \ - { \ - unsigned long sc_args[6]; \ - register unsigned long __sc_0 __asm__ ("r0") = __NR_socketcall; \ - register unsigned long __sc_3 __asm__ ("r3") = opt; \ - register unsigned long __sc_4 __asm__ ("r4"); \ - LSS_SC_LOADARGS_##nr(args); \ - __asm__ __volatile__ \ - ("sc\n\t" \ - "mfcr %0" \ - : "+r" (__sc_0), \ - "+r" (__sc_3), "=r" (__sc_4) \ - : "2"(&sc_args) \ - : "r5", "r6", "r7", "r8", "r9", "r10", "r11", "r12", \ - "cr0", "ctr", "memory"); \ - __sc_ret = __sc_3; \ - __sc_err = __sc_0; \ - } \ - LSS_RETURN(type, __sc_ret, __sc_err) - - LSS_INLINE ssize_t LSS_NAME(recvmsg)(int s,struct kernel_msghdr *msg, - int flags){ - LSS_SC_BODY(3, ssize_t, 17, s, msg, flags); - } - - LSS_INLINE ssize_t LSS_NAME(sendmsg)(int s, - const struct kernel_msghdr *msg, - int flags) { - LSS_SC_BODY(3, ssize_t, 16, s, msg, flags); - } - - LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len, - int flags, - const struct kernel_sockaddr *to, - unsigned int tolen) { - LSS_SC_BODY(6, ssize_t, 11, s, buf, len, flags, to, tolen); - } - - LSS_INLINE int LSS_NAME(shutdown)(int s, int how) { - LSS_SC_BODY(2, int, 13, s, how); + #if defined(__i386__) || \ + defined(__ARM_ARCH_3__) || defined(__ARM_EABI__) || \ + (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) || \ + (defined(__PPC__) && !defined(__PPC64__)) || \ + (defined(__s390__) && !defined(__s390x__)) + /* On these architectures, implement mmap() with mmap2(). */ + LSS_INLINE void* LSS_NAME(mmap)(void *s, size_t l, int p, int f, int d, + int64_t o) { + if (o % 4096) { + LSS_ERRNO = EINVAL; + return (void *) -1; + } + return LSS_NAME(_mmap2)(s, l, p, f, d, (o / 4096)); } - - LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) { - LSS_SC_BODY(3, int, 1, domain, type, protocol); + #elif defined(__s390x__) + /* On s390x, mmap() arguments are passed in memory. */ + LSS_INLINE void* LSS_NAME(mmap)(void *s, size_t l, int p, int f, int d, + int64_t o) { + unsigned long buf[6] = { (unsigned long) s, (unsigned long) l, + (unsigned long) p, (unsigned long) f, + (unsigned long) d, (unsigned long) o }; + LSS_REG(2, buf); + LSS_BODY(void*, mmap, "0"(__r2)); } - - LSS_INLINE int LSS_NAME(socketpair)(int d, int type, int protocol, - int sv[2]) { - LSS_SC_BODY(4, int, 8, d, type, protocol, sv); + #elif defined(__x86_64__) + /* Need to make sure __off64_t isn't truncated to 32-bits under x32. */ + LSS_INLINE void* LSS_NAME(mmap)(void *s, size_t l, int p, int f, int d, + int64_t o) { + LSS_BODY(6, void*, mmap, LSS_SYSCALL_ARG(s), LSS_SYSCALL_ARG(l), + LSS_SYSCALL_ARG(p), LSS_SYSCALL_ARG(f), + LSS_SYSCALL_ARG(d), (uint64_t)(o)); } + #elif defined(__aarch64__) && defined (__ILP32__) + /* aarch64_ilp32 uses mmap2 for sys_mmap() */ + LSS_INLINE _syscall6_long(void*, mmap, mmap2, void*, addr, size_t, length, + int, prot, int, flags, int, fd, int64_t, offset) + #else + /* Remaining 64-bit architectures. */ + LSS_INLINE _syscall6(void*, mmap, void*, addr, size_t, length, int, prot, + int, flags, int, fd, int64_t, offset) #endif #if defined(__i386__) || \ + defined(__PPC__) || \ (defined(__arm__) && !defined(__ARM_EABI__)) || \ - (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) + (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) || \ + defined(__s390__) /* See sys_socketcall in net/socket.c in kernel source. * It de-multiplexes on its first arg and unpacks the arglist * array in its second arg. 
*/ - LSS_INLINE _syscall2(long, socketcall, int, c, unsigned long*, a) - - LSS_INLINE ssize_t LSS_NAME(recvmsg)(int s,struct kernel_msghdr *msg, - int flags){ - unsigned long args[3] = { - (unsigned long) s, - (unsigned long) msg, - (unsigned long) flags - }; - return (ssize_t) LSS_NAME(socketcall)(17, args); - } - - LSS_INLINE ssize_t LSS_NAME(sendmsg)(int s, - const struct kernel_msghdr *msg, - int flags) { - unsigned long args[3] = { - (unsigned long) s, - (unsigned long) msg, - (unsigned long) flags - }; - return (ssize_t) LSS_NAME(socketcall)(16, args); - } - - LSS_INLINE ssize_t LSS_NAME(sendto)(int s, const void *buf, size_t len, - int flags, - const struct kernel_sockaddr *to, - unsigned int tolen) { - unsigned long args[6] = { - (unsigned long) s, - (unsigned long) buf, - (unsigned long) len, - (unsigned long) flags, - (unsigned long) to, - (unsigned long) tolen - }; - return (ssize_t) LSS_NAME(socketcall)(11, args); - } - - LSS_INLINE int LSS_NAME(shutdown)(int s, int how) { - unsigned long args[2] = { - (unsigned long) s, - (unsigned long) how - }; - return LSS_NAME(socketcall)(13, args); - } + LSS_INLINE _syscall2(int, socketcall, int, c, unsigned long*, a) LSS_INLINE int LSS_NAME(socket)(int domain, int type, int protocol) { unsigned long args[3] = { @@ -3464,45 +2832,9 @@ struct kernel_io_event { }; return LSS_NAME(socketcall)(1, args); } - - LSS_INLINE int LSS_NAME(socketpair)(int d, int type, int protocol, - int sv[2]) { - unsigned long args[4] = { - (unsigned long) d, - (unsigned long) type, - (unsigned long) protocol, - (unsigned long) sv - }; - return LSS_NAME(socketcall)(8, args); - } #elif defined(__ARM_EABI__) - /* ARM EABI Linix does not have socketcall. */ - LSS_INLINE _syscall3(ssize_t, recvmsg, int, s, - struct kernel_msghdr*, m, int, f) - LSS_INLINE _syscall3(ssize_t, sendmsg, int, s, - const struct kernel_msghdr*, m, int, f) - LSS_INLINE _syscall6(ssize_t, sendto, int, s, - const void*, b, size_t, l, - int, f, - const struct kernel_sockaddr*, to, - unsigned int, tl) - LSS_INLINE _syscall2(int, shutdown, int, s, - int, h) LSS_INLINE _syscall3(int, socket, int, d, int, t, int, p) - LSS_INLINE _syscall4(int, socketpair, int, d, - int, t, int, p, int*, s) - #endif - #if defined(__i386__) || (defined(__PPC__) && !defined(__PPC64__)) || \ - defined(__arm__) - LSS_INLINE _syscall4(int, fstatat64, int, d, - const char *, p, - struct kernel_stat64 *, b, int, f) - #endif - #if defined(__i386__) || defined(__PPC__) || \ - (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI32) - LSS_INLINE _syscall3(pid_t, waitpid, pid_t, p, - int*, s, int, o) #endif #if defined(__mips__) /* sys_pipe() on MIPS has non-standard calling conventions, as it returns @@ -3526,32 +2858,15 @@ struct kernel_io_event { return 0; } } + #elif defined(__NR_pipe2) + LSS_INLINE _syscall2(int, pipe2, int *, p, + int, f ) + LSS_INLINE int LSS_NAME(pipe)( int * p) { + return LSS_NAME(pipe2)(p, 0); + } #else LSS_INLINE _syscall1(int, pipe, int *, p) #endif - /* TODO(user): see if ppc can/should support this as well */ - #if defined(__i386__) || \ - defined(__arm__) || \ - (defined(__mips__) && _MIPS_SIM != _MIPS_SIM_ABI64) - #define __NR__statfs64 __NR_statfs64 - #define __NR__fstatfs64 __NR_fstatfs64 - LSS_INLINE _syscall3(int, _statfs64, const char*, p, - size_t, s,struct kernel_statfs64*, b) - LSS_INLINE _syscall3(int, _fstatfs64, int, f, - size_t, s,struct kernel_statfs64*, b) - LSS_INLINE int LSS_NAME(statfs64)(const char *p, - struct kernel_statfs64 *b) { - return LSS_NAME(_statfs64)(p, sizeof(*b), b); - } 
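For the 32-bit mmap() wrappers introduced above: mmap2() takes its file offset in 4096-byte pages rather than in bytes, which is why the wrapper rejects offsets that are not a multiple of 4096 (setting EINVAL) and otherwise passes o / 4096 down to _mmap2(). A minimal standalone sketch of that offset handling, using hypothetical helper names that are not part of linux_syscall_support.h:

    #include <cerrno>
    #include <cstdint>

    // Illustration only: convert a byte offset to the page offset mmap2() expects.
    // Returns 0 on success, -1 with errno = EINVAL for sub-page offsets.
    static int byte_offset_to_mmap2_offset(int64_t byte_off, long* page_off) {
        if (byte_off % 4096 != 0) {
            errno = EINVAL;  // mmap2() cannot express sub-page offsets
            return -1;
        }
        *page_off = static_cast<long>(byte_off / 4096);
        return 0;
    }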
- LSS_INLINE int LSS_NAME(fstatfs64)(int f,struct kernel_statfs64 *b) { - return LSS_NAME(_fstatfs64)(f, sizeof(*b), b); - } - #endif - - LSS_INLINE int LSS_NAME(execv)(const char *path, const char *const argv[]) { - extern char **environ; - return LSS_NAME(execve)(path, argv, (const char *const *)environ); - } LSS_INLINE pid_t LSS_NAME(gettid)() { pid_t tid = LSS_NAME(_gettid)(); @@ -3574,102 +2889,20 @@ struct kernel_io_event { } LSS_INLINE int LSS_NAME(ptrace_detach)(pid_t pid) { - return LSS_NAME(ptrace)(PTRACE_DETACH, pid, (void *)0, (void *)0); - } - - LSS_INLINE int LSS_NAME(raise)(int sig) { - return LSS_NAME(kill)(LSS_NAME(getpid)(), sig); - } - - LSS_INLINE int LSS_NAME(setpgrp)() { - return LSS_NAME(setpgid)(0, 0); - } - - LSS_INLINE int LSS_NAME(sysconf)(int name) { - extern int __getpagesize(void); - switch (name) { - case _SC_OPEN_MAX: { - struct kernel_rlimit limit; - - /* On some systems getrlimit is obsolete, use ugetrlimit instead. */ - #ifndef __NR_getrlimit - return LSS_NAME(ugetrlimit)(RLIMIT_NOFILE, &limit) < 0 - ? 8192 : limit.rlim_cur; - #else - return LSS_NAME(getrlimit)(RLIMIT_NOFILE, &limit) < 0 - ? 8192 : limit.rlim_cur; - #endif - } - case _SC_PAGESIZE: - return __getpagesize(); - default: - LSS_ERRNO = ENOSYS; - return -1; - } - } - #if defined(__x86_64__) || \ - (defined(__mips__) && _MIPS_SIM == _MIPS_SIM_ABI64) - LSS_INLINE _syscall4(ssize_t, pread64, int, f, - void *, b, size_t, c, - loff_t, o) - LSS_INLINE _syscall4(ssize_t, pwrite64, int, f, - const void *, b, size_t, c, - loff_t, o) - LSS_INLINE _syscall3(int, readahead, int, f, - loff_t, o, unsigned, c) - #else - #define __NR__pread64 __NR_pread64 - #define __NR__pwrite64 __NR_pwrite64 - #define __NR__readahead __NR_readahead - LSS_INLINE _syscall5(ssize_t, _pread64, int, f, - void *, b, size_t, c, unsigned, o1, - unsigned, o2) - LSS_INLINE _syscall5(ssize_t, _pwrite64, int, f, - const void *, b, size_t, c, unsigned, o1, - long, o2) - LSS_INLINE _syscall4(int, _readahead, int, f, - unsigned, o1, unsigned, o2, size_t, c); - /* We force 64bit-wide parameters onto the stack, then access each - * 32-bit component individually. This guarantees that we build the - * correct parameters independent of the native byte-order of the - * underlying architecture. + /* PTRACE_DETACH can sometimes forget to wake up the tracee and it + * then sends job control signals to the real parent, rather than to + * the tracer. We reduce the risk of this happening by starting a + * whole new time slice, and then quickly sending a SIGCONT signal + * right after detaching from the tracee. 
*/ - LSS_INLINE ssize_t LSS_NAME(pread64)(int fd, void *buf, size_t count, - loff_t off) { - union { loff_t off; unsigned arg[2]; } o = { off }; - return LSS_NAME(_pread64)(fd, buf, count, o.arg[0], o.arg[1]); - } - LSS_INLINE ssize_t LSS_NAME(pwrite64)(int fd, const void *buf, - size_t count, loff_t off) { - union { loff_t off; unsigned arg[2]; } o = { off }; - return LSS_NAME(_pwrite64)(fd, buf, count, o.arg[0], o.arg[1]); - } - LSS_INLINE int LSS_NAME(readahead)(int fd, loff_t off, int len) { - union { loff_t off; unsigned arg[2]; } o = { off }; - return LSS_NAME(_readahead)(fd, o.arg[0], o.arg[1], len); - } - #endif - #if defined(__NR_io_setup) - LSS_INLINE _syscall2(int, io_setup, - int, maxevents, - unsigned long *, ctxp); - LSS_INLINE _syscall3(int, io_submit, - unsigned long, ctx_id, - long, nr, - struct kernel_iocb **, ios); - LSS_INLINE _syscall5(int, io_getevents, - unsigned long, ctx_id, - long, min_nr, - long, nr, - struct kernel_io_event *, events, - struct kernel_timespec*, timeout); - LSS_INLINE _syscall1(int, io_destroy, - unsigned long, ctx); - LSS_INLINE _syscall3(int, io_cancel, - unsigned long, ctx_id, - struct kernel_iocb*, iocb, - struct kernel_io_event*, result); - #endif + int rc, err; + LSS_NAME(sched_yield)(); + rc = LSS_NAME(ptrace)(PTRACE_DETACH, pid, (void *)0, (void *)0); + err = LSS_ERRNO; + LSS_NAME(kill)(pid, SIGCONT); + LSS_ERRNO = err; + return rc; + } #endif #if defined(__cplusplus) && !defined(SYS_CPLUSPLUS) diff --git a/be/src/gutil/spinlock_linux-inl.h b/be/src/gutil/spinlock_linux-inl.h index c9838e49a24715..042ff8e21d6a16 100644 --- a/be/src/gutil/spinlock_linux-inl.h +++ b/be/src/gutil/spinlock_linux-inl.h @@ -51,15 +51,10 @@ static struct InitModule { int x = 0; // futexes are ints, so we can use them only when // that's the same size as the lockword_ in SpinLock. -#ifdef __arm__ - // ARM linux doesn't support sys_futex1(void*, int, int, struct timespec*); - have_futex = 0; -#else have_futex = (sizeof (Atomic32) == sizeof (int) && - sys_futex(&x, FUTEX_WAKE, 1, 0) >= 0); -#endif + sys_futex(&x, FUTEX_WAKE, 1, NULL, NULL, 0) >= 0); if (have_futex && - sys_futex(&x, FUTEX_WAKE | futex_private_flag, 1, 0) < 0) { + sys_futex(&x, FUTEX_WAKE | futex_private_flag, 1, NULL, NULL, 0) < 0) { futex_private_flag = 0; } } @@ -85,7 +80,8 @@ void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) { tm.tv_nsec *= 16; // increase the delay; we expect explicit wakeups sys_futex(reinterpret_cast(const_cast(w)), FUTEX_WAIT | futex_private_flag, - value, reinterpret_cast(&tm)); + value, reinterpret_cast(&tm), + NULL, 0); } else { nanosleep(&tm, NULL); } @@ -96,7 +92,8 @@ void SpinLockDelay(volatile Atomic32 *w, int32 value, int loop) { void SpinLockWake(volatile Atomic32 *w, bool all) { if (have_futex) { sys_futex(reinterpret_cast(const_cast(w)), - FUTEX_WAKE | futex_private_flag, all? INT_MAX : 1, 0); + FUTEX_WAKE | futex_private_flag, all? 
INT_MAX : 1, + NULL, NULL, 0); } } diff --git a/be/src/olap/olap_snapshot_converter.cpp b/be/src/olap/olap_snapshot_converter.cpp index 23cf4fd5cbb802..bd8d308747e7b8 100755 --- a/be/src/olap/olap_snapshot_converter.cpp +++ b/be/src/olap/olap_snapshot_converter.cpp @@ -335,6 +335,9 @@ OLAPStatus OlapSnapshotConverter::to_column_pb(const ColumnMessage& column_msg, if (column_msg.has_is_bf_column()) { column_pb->set_is_bf_column(column_msg.is_bf_column()); } + if (column_msg.has_has_bitmap_index()) { + column_pb->set_has_bitmap_index(column_msg.has_bitmap_index()); + } // TODO(ygl) calculate column id from column list // column_pb->set_referenced_column_id(column_msg.()); @@ -388,6 +391,9 @@ OLAPStatus OlapSnapshotConverter::to_column_msg(const ColumnPB& column_pb, Colum if (column_pb.has_is_bf_column()) { column_msg->set_is_bf_column(column_pb.is_bf_column()); } + if (column_pb.has_has_bitmap_index()) { + column_msg->set_has_bitmap_index(column_pb.has_bitmap_index()); + } column_msg->set_is_root_column(true); return OLAP_SUCCESS; } diff --git a/be/src/olap/rowset/segment_v2/segment_iterator.cpp b/be/src/olap/rowset/segment_v2/segment_iterator.cpp index 17cde912c05feb..551afc399d2a0e 100644 --- a/be/src/olap/rowset/segment_v2/segment_iterator.cpp +++ b/be/src/olap/rowset/segment_v2/segment_iterator.cpp @@ -520,6 +520,7 @@ Status SegmentIterator::next_batch(RowBlockV2* block) { column_predicate->evaluate(&column_block, block->selection_vector(), &selected_size); } block->set_selected_size(selected_size); + block->set_num_rows(selected_size); _opts.stats->rows_vec_cond_filtered += original_size - selected_size; } diff --git a/be/src/olap/rowset/segment_v2/segment_writer.cpp b/be/src/olap/rowset/segment_v2/segment_writer.cpp index 8e5dd0403ec967..5aeddb34d3f51d 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.cpp +++ b/be/src/olap/rowset/segment_v2/segment_writer.cpp @@ -62,10 +62,6 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec) { if (column.is_key()) { opts.need_zone_map = true; } - // TODO set opts.need_bitmap_index based on table properties - if (!column.is_key()) { - opts.need_bitmap_index = _opts.need_bitmap_index; - } if (column.is_bf_column()) { opts.need_bloom_filter = true; if ((column.aggregation() == OLAP_FIELD_AGGREGATION_REPLACE @@ -77,6 +73,17 @@ Status SegmentWriter::init(uint32_t write_mbytes_per_sec) { opts.need_bloom_filter = false; } } + if (column.has_bitmap_index()) { + opts.need_bitmap_index = true; + if ((column.aggregation() == OLAP_FIELD_AGGREGATION_REPLACE + || column.aggregation() == OLAP_FIELD_AGGREGATION_REPLACE_IF_NOT_NULL) + && !_opts.whether_to_filter_value) { + // if the column's Aggregation type is OLAP_FIELD_AGGREGATION_REPLACE or + // OLAP_FIELD_AGGREGATION_REPLACE_IF_NOT_NULL and the segment is not in base rowset, + // do not write the bitmap index because it is useless + opts.need_bitmap_index = false; + } + } std::unique_ptr field(FieldFactory::create(column)); DCHECK(field.get() != nullptr); diff --git a/be/src/olap/rowset/segment_v2/segment_writer.h b/be/src/olap/rowset/segment_v2/segment_writer.h index ee607730d3a008..33988b34fe50be 100644 --- a/be/src/olap/rowset/segment_v2/segment_writer.h +++ b/be/src/olap/rowset/segment_v2/segment_writer.h @@ -44,8 +44,6 @@ extern const uint32_t k_segment_magic_length; struct SegmentWriterOptions { uint32_t num_rows_per_block = 1024; - // Todo(kks): only for UT, we should remove it when we support bitmap_index in FE - bool need_bitmap_index = false; // whether to filter value column against 
bloom filter/zone map bool whether_to_filter_value = false; }; diff --git a/be/src/olap/schema_change.cpp b/be/src/olap/schema_change.cpp index 8b6ba7b0d8f253..66fcce0f066e80 100644 --- a/be/src/olap/schema_change.cpp +++ b/be/src/olap/schema_change.cpp @@ -1931,6 +1931,10 @@ OLAPStatus SchemaChangeHandler::_parse_request(TabletSharedPtr base_tablet, != ref_tablet_schema.column(column_mapping->ref_column).is_bf_column()) { *sc_directly = true; return OLAP_SUCCESS; + } else if (new_tablet_schema.column(i).has_bitmap_index() + != ref_tablet_schema.column(column_mapping->ref_column).has_bitmap_index()) { + *sc_directly = true; + return OLAP_SUCCESS; } } } diff --git a/be/src/olap/tablet_manager.cpp b/be/src/olap/tablet_manager.cpp index 121fb462964518..2ba6e219270191 100644 --- a/be/src/olap/tablet_manager.cpp +++ b/be/src/olap/tablet_manager.cpp @@ -1198,7 +1198,17 @@ OLAPStatus TabletManager::_create_inital_rowset_unlocked( context.tablet_id = tablet->tablet_id(); context.partition_id = tablet->partition_id(); context.tablet_schema_hash = tablet->schema_hash(); - context.rowset_type = StorageEngine::instance()->default_rowset_type(); + if (!request.__isset.storage_format || request.storage_format == TStorageFormat::DEFAULT) { + context.rowset_type = StorageEngine::instance()->default_rowset_type(); + } else if (request.storage_format == TStorageFormat::V1){ + context.rowset_type = RowsetTypePB::ALPHA_ROWSET; + } else if (request.storage_format == TStorageFormat::V2) { + context.rowset_type = RowsetTypePB::BETA_ROWSET; + } else { + LOG(ERROR) << "invalid TStorageFormat: " << request.storage_format; + DCHECK(false); + context.rowset_type = StorageEngine::instance()->default_rowset_type(); + } context.rowset_path_prefix = tablet->tablet_path(); context.tablet_schema = &(tablet->tablet_schema()); context.rowset_state = VISIBLE; diff --git a/be/src/olap/tablet_meta.cpp b/be/src/olap/tablet_meta.cpp index b61d579c4772f2..a29b6881de1083 100755 --- a/be/src/olap/tablet_meta.cpp +++ b/be/src/olap/tablet_meta.cpp @@ -18,6 +18,7 @@ #include "olap/tablet_meta.h" #include +#include #include "olap/file_helper.h" #include "olap/olap_common.h" @@ -116,6 +117,7 @@ TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, uint32_t unique_id = col_ordinal_to_unique_id.at(col_ordinal++); column->set_unique_id(unique_id); column->set_name(tcolumn.column_name); + column->set_has_bitmap_index(false); string data_type; EnumToString(TPrimitiveType, tcolumn.column_type.type, data_type); column->set_type(data_type); @@ -152,6 +154,17 @@ TabletMeta::TabletMeta(int64_t table_id, int64_t partition_id, column->set_is_bf_column(tcolumn.is_bloom_filter_column); has_bf_columns = true; } + if (tablet_schema.__isset.indexes) { + for (auto& index : tablet_schema.indexes) { + if (index.index_type == TIndexType::type::BITMAP) { + DCHECK_EQ(index.columns.size(), 1); + if (boost::iequals(tcolumn.column_name, index.columns[0])) { + column->set_has_bitmap_index(true); + break; + } + } + } + } } schema->set_next_column_unique_id(next_unique_id); diff --git a/be/src/olap/tablet_schema.cpp b/be/src/olap/tablet_schema.cpp index d368e279df1410..e8c52f3de87cf6 100644 --- a/be/src/olap/tablet_schema.cpp +++ b/be/src/olap/tablet_schema.cpp @@ -295,6 +295,11 @@ OLAPStatus TabletColumn::init_from_pb(const ColumnPB& column) { } else { _is_bf_column = false; } + if (column.has_has_bitmap_index()) { + _has_bitmap_index = column.has_bitmap_index(); + } else { + _has_bitmap_index = false; + } _has_referenced_column = 
column.has_referenced_column_id(); if (_has_referenced_column) { _referenced_column_id = column.referenced_column_id(); @@ -327,6 +332,9 @@ OLAPStatus TabletColumn::to_schema_pb(ColumnPB* column) { if (_has_referenced_column) { column->set_referenced_column_id(_referenced_column_id); } + if (_has_bitmap_index) { + column->set_has_bitmap_index(_has_bitmap_index); + } return OLAP_SUCCESS; } diff --git a/be/src/olap/tablet_schema.h b/be/src/olap/tablet_schema.h index 8b7b0f2c0c296c..5346ab0560904b 100644 --- a/be/src/olap/tablet_schema.h +++ b/be/src/olap/tablet_schema.h @@ -40,6 +40,7 @@ class TabletColumn { inline bool is_key() const { return _is_key; } inline bool is_nullable() const { return _is_nullable; } inline bool is_bf_column() const { return _is_bf_column; } + inline bool has_bitmap_index() const {return _has_bitmap_index; } bool has_default_value() const { return _has_default_value; } std::string default_value() const { return _default_value; } bool has_reference_column() const { return _has_referenced_column; } @@ -80,6 +81,8 @@ class TabletColumn { bool _has_referenced_column; int32_t _referenced_column_id; std::string _referenced_column; + + bool _has_bitmap_index = false; }; class TabletSchema { diff --git a/be/src/util/CMakeLists.txt b/be/src/util/CMakeLists.txt index 144ead741c6952..899600bb1aff7e 100644 --- a/be/src/util/CMakeLists.txt +++ b/be/src/util/CMakeLists.txt @@ -84,6 +84,7 @@ set(UTIL_FILES minizip/ioapi.c minizip/unzip.c zip_util.cpp + utf8_check.cpp ) if (WITH_MYSQL) diff --git a/be/src/util/cpu_info.cpp b/be/src/util/cpu_info.cpp index 7bdb120e3d9ec6..2b7f9cce9e5b22 100755 --- a/be/src/util/cpu_info.cpp +++ b/be/src/util/cpu_info.cpp @@ -17,11 +17,23 @@ #include "util/cpu_info.h" -#ifdef __APPLE__ -#include +#if defined(__GNUC__) && (defined(__x86_64__) || defined(__i386__)) +/* GCC-compatible compiler, targeting x86/x86-64 */ +#include +#elif defined(__GNUC__) && defined(__ARM_NEON__) +/* GCC-compatible compiler, targeting ARM with NEON */ +#include +#elif defined(__GNUC__) && defined(__IWMMXT__) +/* GCC-compatible compiler, targeting ARM with WMMX */ +#include +#elif (defined(__GNUC__) || defined(__xlC__)) && (defined(__VEC__) || defined(__ALTIVEC__)) +/* XLC or GCC-compatible compiler, targeting PowerPC with VMX/VSX */ +#include +#elif defined(__GNUC__) && defined(__SPE__) +/* GCC-compatible compiler, targeting PowerPC with SPE */ +#include #endif -#include #include #include #include diff --git a/be/src/util/stopwatch.hpp b/be/src/util/stopwatch.hpp index df809eb6af585b..1d52d86f5b3a2a 100644 --- a/be/src/util/stopwatch.hpp +++ b/be/src/util/stopwatch.hpp @@ -23,50 +23,6 @@ namespace doris { -// Utility class to measure time. This is measured using the cpu tick counter which -// is very low overhead but can be inaccurate if the thread is switched away. This -// is useful for measuring cpu time at the row batch level (too much overhead at the -// row granularity). -class StopWatch { -public: - StopWatch() { - _total_time = 0; - _running = false; - } - - void start() { - if (!_running) { - _start = rdtsc(); - _running = true; - } - } - - void stop() { - if (_running) { - _total_time += rdtsc() - _start; - _running = false; - } - } - - // Returns time in cpu ticks. - uint64_t elapsed_time() const { - return _running ? 
rdtsc() - _start : _total_time; - } - - static uint64_t rdtsc() { - uint32_t lo, hi; - __asm__ __volatile__( - "xorl %%eax,%%eax \n cpuid" - ::: "%rax", "%rbx", "%rcx", "%rdx"); - __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi)); - return (uint64_t)hi << 32 | lo; - } - -private: - uint64_t _start, _total_time; - bool _running; -}; - // Stop watch for reporting elapsed time in nanosec based on CLOCK_MONOTONIC. // It is as fast as Rdtsc. // It is also accurate because it not affected by cpu frequency changes and diff --git a/be/src/util/utf8_check.cpp b/be/src/util/utf8_check.cpp new file mode 100644 index 00000000000000..9fc416143a1d03 --- /dev/null +++ b/be/src/util/utf8_check.cpp @@ -0,0 +1,320 @@ +// Copyright (c) cyb70289(https://github.com/cyb70289). All rights reserved. +// Use of this source code is governed by a MIT license that can be +// found in the LICENSE file. + +/* + * These functions are used for validating utf8 string. + * Details can be seen here: https://github.com/cyb70289/utf8/ + */ + +#include "util/utf8_check.h" + +#include + +#if defined(__i386) || defined(__x86_64__) +#include "util/simdutf8check.h" +#elif defined(__aarch64__) +#include +#endif + +/* + * http://www.unicode.org/versions/Unicode6.0.0/ch03.pdf - page 94 + * + * Table 3-7. Well-Formed UTF-8 Byte Sequences + * + * +--------------------+------------+-------------+------------+-------------+ + * | Code Points | First Byte | Second Byte | Third Byte | Fourth Byte | + * +--------------------+------------+-------------+------------+-------------+ + * | U+0000..U+007F | 00..7F | | | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+0080..U+07FF | C2..DF | 80..BF | | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+0800..U+0FFF | E0 | A0..BF | 80..BF | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+1000..U+CFFF | E1..EC | 80..BF | 80..BF | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+D000..U+D7FF | ED | 80..9F | 80..BF | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+E000..U+FFFF | EE..EF | 80..BF | 80..BF | | + * +--------------------+------------+-------------+------------+-------------+ + * | U+10000..U+3FFFF | F0 | 90..BF | 80..BF | 80..BF | + * +--------------------+------------+-------------+------------+-------------+ + * | U+40000..U+FFFFF | F1..F3 | 80..BF | 80..BF | 80..BF | + * +--------------------+------------+-------------+------------+-------------+ + * | U+100000..U+10FFFF | F4 | 80..8F | 80..BF | 80..BF | + * +--------------------+------------+-------------+------------+-------------+ + */ +namespace doris { +bool validate_utf8_naive(const char *data, size_t len) { + while (len) { + int bytes; + const unsigned char byte1 = data[0]; + + /* 00..7F */ + if (byte1 <= 0x7F) { + bytes = 1; + /* C2..DF, 80..BF */ + } else if (len >= 2 && byte1 >= 0xC2 && byte1 <= 0xDF && + (signed char)data[1] <= (signed char)0xBF) { + bytes = 2; + } else if (len >= 3) { + const unsigned char byte2 = data[1]; + + /* Is byte2, byte3 between 0x80 ~ 0xBF */ + const int byte2_ok = (signed char)byte2 <= (signed char)0xBF; + const int byte3_ok = (signed char)data[2] <= (signed char)0xBF; + + if (byte2_ok && byte3_ok && + /* E0, A0..BF, 80..BF */ + ((byte1 == 0xE0 && byte2 >= 0xA0) || + /* E1..EC, 80..BF, 80..BF */ + (byte1 >= 0xE1 && byte1 <= 0xEC) || + /* ED, 80..9F, 80..BF */ + (byte1 == 0xED && byte2 <= 
0x9F) || + /* EE..EF, 80..BF, 80..BF */ + (byte1 >= 0xEE && byte1 <= 0xEF))) { + bytes = 3; + } else if (len >= 4) { + /* Is byte4 between 0x80 ~ 0xBF */ + const int byte4_ok = (signed char)data[3] <= (signed char)0xBF; + + if (byte2_ok && byte3_ok && byte4_ok && + /* F0, 90..BF, 80..BF, 80..BF */ + ((byte1 == 0xF0 && byte2 >= 0x90) || + /* F1..F3, 80..BF, 80..BF, 80..BF */ + (byte1 >= 0xF1 && byte1 <= 0xF3) || + /* F4, 80..8F, 80..BF, 80..BF */ + (byte1 == 0xF4 && byte2 <= 0x8F))) { + bytes = 4; + } else { + return false; + } + } else { + return false; + } + } else { + return false; + } + + len -= bytes; + data += bytes; + } + + return true; +} + +#if defined(__i386) || defined(__x86_64__) +bool validate_utf8(const char *src, size_t len) { + return validate_utf8_fast(src, len); +} +#elif defined(__aarch64__) +/* + * Map high nibble of "First Byte" to legal character length minus 1 + * 0x00 ~ 0xBF --> 0 + * 0xC0 ~ 0xDF --> 1 + * 0xE0 ~ 0xEF --> 2 + * 0xF0 ~ 0xFF --> 3 + */ +const uint8_t _first_len_tbl[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 3, +}; + +/* Map "First Byte" to 8-th item of range table (0xC2 ~ 0xF4) */ +static const uint8_t _first_range_tbl[] = { + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 8, 8, 8, 8, +}; + +/* + * Range table, map range index to min and max values + * Index 0 : 00 ~ 7F (First Byte, ascii) + * Index 1,2,3: 80 ~ BF (Second, Third, Fourth Byte) + * Index 4 : A0 ~ BF (Second Byte after E0) + * Index 5 : 80 ~ 9F (Second Byte after ED) + * Index 6 : 90 ~ BF (Second Byte after F0) + * Index 7 : 80 ~ 8F (Second Byte after F4) + * Index 8 : C2 ~ F4 (First Byte, non ascii) + * Index 9~15 : illegal: u >= 255 && u <= 0 + */ +static const uint8_t _range_min_tbl[] = { + 0x00, 0x80, 0x80, 0x80, 0xA0, 0x80, 0x90, 0x80, + 0xC2, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, +}; +static const uint8_t _range_max_tbl[] = { + 0x7F, 0xBF, 0xBF, 0xBF, 0xBF, 0x9F, 0xBF, 0x8F, + 0xF4, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, +}; + +/* + * This table is for fast handling four special First Bytes(E0,ED,F0,F4), after + * which the Second Byte are not 80~BF. It contains "range index adjustment". + * - The idea is to minus byte with E0, use the result(0~31) as the index to + * lookup the "range index adjustment". Then add the adjustment to original + * range index to get the correct range. + * - Range index adjustment + * +------------+---------------+------------------+----------------+ + * | First Byte | original range| range adjustment | adjusted range | + * +------------+---------------+------------------+----------------+ + * | E0 | 2 | 2 | 4 | + * +------------+---------------+------------------+----------------+ + * | ED | 2 | 3 | 5 | + * +------------+---------------+------------------+----------------+ + * | F0 | 3 | 3 | 6 | + * +------------+---------------+------------------+----------------+ + * | F4 | 4 | 4 | 8 | + * +------------+---------------+------------------+----------------+ + * - Below is a uint8x16x2 table, data is interleaved in NEON register. So I'm + * putting it vertically. 1st column is for E0~EF, 2nd column for F0~FF. 
+ */ +static const uint8_t _range_adjust_tbl[] = { + /* index -> 0~15 16~31 <- index */ + /* E0 -> */ 2, 3, /* <- F0 */ + 0, 0, + 0, 0, + 0, 0, + 0, 4, /* <- F4 */ + 0, 0, + 0, 0, + 0, 0, + 0, 0, + 0, 0, + 0, 0, + 0, 0, + 0, 0, + /* ED -> */ 3, 0, + 0, 0, + 0, 0, +}; + +/* 2x ~ 4x faster than naive method */ +/* Return true on success, false on error */ +bool utf8_range(const char *data, size_t len) +{ + if (len >= 16) { + uint8x16_t prev_input = vdupq_n_u8(0); + uint8x16_t prev_first_len = vdupq_n_u8(0); + + /* Cached tables */ + const uint8x16_t first_len_tbl = vld1q_u8(_first_len_tbl); + const uint8x16_t first_range_tbl = vld1q_u8(_first_range_tbl); + const uint8x16_t range_min_tbl = vld1q_u8(_range_min_tbl); + const uint8x16_t range_max_tbl = vld1q_u8(_range_max_tbl); + const uint8x16x2_t range_adjust_tbl = vld2q_u8(_range_adjust_tbl); + + /* Cached values */ + const uint8x16_t const_1 = vdupq_n_u8(1); + const uint8x16_t const_2 = vdupq_n_u8(2); + const uint8x16_t const_e0 = vdupq_n_u8(0xE0); + + uint8x16_t error = vdupq_n_u8(0); + + while (len >= 16) { + const uint8x16_t input = vld1q_u8((const uint8_t*)data); + + /* high_nibbles = input >> 4 */ + const uint8x16_t high_nibbles = vshrq_n_u8(input, 4); + + /* first_len = legal character length minus 1 */ + /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */ + /* first_len = first_len_tbl[high_nibbles] */ + const uint8x16_t first_len = + vqtbl1q_u8(first_len_tbl, high_nibbles); + + /* First Byte: set range index to 8 for bytes within 0xC0 ~ 0xFF */ + /* range = first_range_tbl[high_nibbles] */ + uint8x16_t range = vqtbl1q_u8(first_range_tbl, high_nibbles); + + /* Second Byte: set range index to first_len */ + /* 0 for 00~7F, 1 for C0~DF, 2 for E0~EF, 3 for F0~FF */ + /* range |= (first_len, prev_first_len) << 1 byte */ + range = + vorrq_u8(range, vextq_u8(prev_first_len, first_len, 15)); + + /* Third Byte: set range index to saturate_sub(first_len, 1) */ + /* 0 for 00~7F, 0 for C0~DF, 1 for E0~EF, 2 for F0~FF */ + uint8x16_t tmp1, tmp2; + /* tmp1 = saturate_sub(first_len, 1) */ + tmp1 = vqsubq_u8(first_len, const_1); + /* tmp2 = saturate_sub(prev_first_len, 1) */ + tmp2 = vqsubq_u8(prev_first_len, const_1); + /* range |= (tmp1, tmp2) << 2 bytes */ + range = vorrq_u8(range, vextq_u8(tmp2, tmp1, 14)); + + /* Fourth Byte: set range index to saturate_sub(first_len, 2) */ + /* 0 for 00~7F, 0 for C0~DF, 0 for E0~EF, 1 for F0~FF */ + /* tmp1 = saturate_sub(first_len, 2) */ + tmp1 = vqsubq_u8(first_len, const_2); + /* tmp2 = saturate_sub(prev_first_len, 2) */ + tmp2 = vqsubq_u8(prev_first_len, const_2); + /* range |= (tmp1, tmp2) << 3 bytes */ + range = vorrq_u8(range, vextq_u8(tmp2, tmp1, 13)); + + /* + * Now we have below range indices caluclated + * Correct cases: + * - 8 for C0~FF + * - 3 for 1st byte after F0~FF + * - 2 for 1st byte after E0~EF or 2nd byte after F0~FF + * - 1 for 1st byte after C0~DF or 2nd byte after E0~EF or + * 3rd byte after F0~FF + * - 0 for others + * Error cases: + * 9,10,11 if non ascii First Byte overlaps + * E.g., F1 80 C2 90 --> 8 3 10 2, where 10 indicates error + */ + + /* Adjust Second Byte range for special First Bytes(E0,ED,F0,F4) */ + /* See _range_adjust_tbl[] definition for details */ + /* Overlaps lead to index 9~15, which are illegal in range table */ + uint8x16_t shift1 = vextq_u8(prev_input, input, 15); + uint8x16_t pos = vsubq_u8(shift1, const_e0); + range = vaddq_u8(range, vqtbl2q_u8(range_adjust_tbl, pos)); + + /* Load min and max values per calculated range index */ + uint8x16_t minv = 
vqtbl1q_u8(range_min_tbl, range); + uint8x16_t maxv = vqtbl1q_u8(range_max_tbl, range); + + /* Check value range */ + error = vorrq_u8(error, vcltq_u8(input, minv)); + error = vorrq_u8(error, vcgtq_u8(input, maxv)); + + prev_input = input; + prev_first_len = first_len; + + data += 16; + len -= 16; + } + + /* Delay error check till loop ends */ + if (vmaxvq_u8(error)) + return false; + + /* Find previous token (not 80~BF) */ + uint32_t token4; + vst1q_lane_u32(&token4, vreinterpretq_u32_u8(prev_input), 3); + + const int8_t *token = (const int8_t *)&token4; + int lookahead = 0; + if (token[3] > (int8_t)0xBF) + lookahead = 1; + else if (token[2] > (int8_t)0xBF) + lookahead = 2; + else if (token[1] > (int8_t)0xBF) + lookahead = 3; + + data -= lookahead; + len += lookahead; + } + + /* Check remaining bytes with naive method */ + return validate_utf8_naive(data, len); +} + +bool validate_utf8(const char *src, size_t len) { + return utf8_range(src, len); +} +#else +bool validate_utf8(const char *src, size_t len) { + return validate_utf8_naive(src, len); +} +#endif +} diff --git a/be/src/util/utf8_check.h b/be/src/util/utf8_check.h new file mode 100644 index 00000000000000..57841bf5fc2ca6 --- /dev/null +++ b/be/src/util/utf8_check.h @@ -0,0 +1,32 @@ + +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
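Both validators in utf8_check.cpp key their decisions off the lead byte: the naive path matches it against the Table 3-7 ranges, and the NEON path looks its high nibble up in _first_len_tbl to get the expected number of continuation bytes. A scalar sketch of that table lookup, illustrative only and not part of the patch:

    #include <cstdint>

    // Mirrors _first_len_tbl above: high nibble of a lead byte -> continuation bytes.
    static const uint8_t kFirstLenTbl[16] = {0, 0, 0, 0, 0, 0, 0, 0,
                                             0, 0, 0, 0, 1, 1, 2, 3};

    inline int continuation_bytes(uint8_t lead) {
        // 'A' (0x41) -> 0, 0xC3 -> 1, 0xE4 -> 2, 0xF0 -> 3
        return kFirstLenTbl[lead >> 4];
    }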
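A small usage sketch of the new entry point (hypothetical demo code, not part of the patch): validate_utf8() dispatches to the SIMD implementation from simdutf8check.h on x86, to the NEON range method on aarch64, and to validate_utf8_naive() elsewhere, so callers only see a single function:

    #include <cstring>
    #include <iostream>

    #include "util/utf8_check.h"

    int main() {
        const char valid[] = "doris \xE4\xB8\xAD\xE6\x96\x87";  // ASCII plus two 3-byte code points
        const char invalid[] = "bad \xC0\xAF";                   // overlong encoding of '/', rejected
        std::cout << doris::validate_utf8(valid, std::strlen(valid)) << std::endl;      // prints 1
        std::cout << doris::validate_utf8(invalid, std::strlen(invalid)) << std::endl;  // prints 0
        return 0;
    }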
+ +#ifndef DORIS_BE_SRC_UTIL_UTF8_CHECK_H +#define DORIS_BE_SRC_UTIL_UTF8_CHECK_H + +#include + +namespace doris { +// check utf8 code using simd instructions +// Return true - success, false fail +bool validate_utf8(const char *src, size_t len); +// chech utf8 use naive c++ +bool validate_utf8_naive(const char *data, size_t len); +} // namespce doris + +#endif // DORIS_BE_SRC_UTIL_UTF8_CHECK_H diff --git a/be/test/exprs/bitmap_function_test.cpp b/be/test/exprs/bitmap_function_test.cpp index fa79afd855c62b..d48b93fc749df2 100644 --- a/be/test/exprs/bitmap_function_test.cpp +++ b/be/test/exprs/bitmap_function_test.cpp @@ -28,19 +28,17 @@ namespace doris { -StringVal convert_bitmap_to_string(FunctionContext* ctx, RoaringBitmap& bitmap) { - std::string buf; - buf.resize(bitmap.size()); - bitmap.serialize((char*)buf.c_str()); - return AnyValUtil::from_string_temp(ctx, buf); +StringVal convert_bitmap_to_string(FunctionContext* ctx,RoaringBitmap& bitmap) { + StringVal result(ctx, bitmap.size()); + bitmap.serialize((char*)result.ptr); + return result; } template StringVal convert_bitmap_intersect_to_string(FunctionContext* ctx, BitmapIntersect& intersect) { - std::string buf; - buf.resize(intersect.size()); - intersect.serialize((char*)buf.c_str()); - return AnyValUtil::from_string_temp(ctx, buf); + StringVal result(ctx,intersect.size()); + intersect.serialize((char*)result.ptr); + return result; } class BitmapFunctionsTest : public testing::Test { @@ -248,6 +246,46 @@ TEST_F(BitmapFunctionsTest, test_bitmap_intersect) { } +TEST_F(BitmapFunctionsTest,bitmap_or) { + RoaringBitmap bitmap1(1024); + bitmap1.update(1); + bitmap1.update(2019); + + RoaringBitmap bitmap2(33); + bitmap2.update(44); + bitmap2.update(55); + + StringVal bitmap_src = convert_bitmap_to_string(ctx, bitmap1); + StringVal bitmap_dst = convert_bitmap_to_string(ctx, bitmap2); + + StringVal bitmap_str = BitmapFunctions::bitmap_or(ctx,bitmap_src,bitmap_dst); + BigIntVal result = BitmapFunctions::bitmap_count(ctx,bitmap_str); + + BigIntVal expected(6); + ASSERT_EQ(expected, result); +} + + +TEST_F(BitmapFunctionsTest,bitmap_and) { + RoaringBitmap bitmap1(1024); + bitmap1.update(1); + bitmap1.update(2019); + + RoaringBitmap bitmap2(33); + bitmap2.update(44); + bitmap2.update(2019); + + StringVal bitmap_src = convert_bitmap_to_string(ctx, bitmap1); + StringVal bitmap_dst = convert_bitmap_to_string(ctx, bitmap2); + + StringVal bitmap_str = BitmapFunctions::bitmap_and(ctx,bitmap_src,bitmap_dst); + BigIntVal result = BitmapFunctions::bitmap_count(ctx,bitmap_str); + + BigIntVal expected(1); + ASSERT_EQ(expected, result); +} + + } int main(int argc, char** argv) { diff --git a/be/test/geo/geo_functions_test.cpp b/be/test/geo/geo_functions_test.cpp index f54d84a1086843..9a7779a8646f1b 100644 --- a/be/test/geo/geo_functions_test.cpp +++ b/be/test/geo/geo_functions_test.cpp @@ -136,8 +136,8 @@ TEST_F(GeoFunctionsTest, st_from_wkt) { GeoPoint point; auto res = point.decode_from(str2.ptr, str2.len); ASSERT_TRUE(res); - ASSERT_EQ(10.1, point.x()); - ASSERT_EQ(20.2, point.y()); + ASSERT_DOUBLE_EQ(10.1, point.x()); + ASSERT_DOUBLE_EQ(20.2, point.y()); GeoFunctions::st_from_wkt_close(ctx, FunctionContext::FRAGMENT_LOCAL); } } diff --git a/be/test/olap/rowset/segment_v2/segment_test.cpp b/be/test/olap/rowset/segment_v2/segment_test.cpp index 2f35c2ab6eab9a..81fcfdf1b3d87b 100644 --- a/be/test/olap/rowset/segment_v2/segment_test.cpp +++ b/be/test/olap/rowset/segment_v2/segment_test.cpp @@ -349,16 +349,16 @@ TEST_F(SegmentReaderWriterTest, 
LazyMaterialization) { } { + tablet_schema = create_schema({ create_int_key(1, true, false, true), create_int_value(2) }); shared_ptr segment; SegmentWriterOptions write_opts; - write_opts.need_bitmap_index = true; // c2 with bitmap index build_segment(write_opts, tablet_schema, tablet_schema, 100, data_gen, &segment); - ASSERT_TRUE(segment->footer().columns(1).has_bitmap_index()); + ASSERT_TRUE(segment->footer().columns(0).has_bitmap_index()); { // lazy disabled when all predicates are removed by bitmap index: // select c1, c2 where c2 = 30; Schema read_schema(tablet_schema); - unique_ptr predicate(new EqualPredicate(1, 200)); + unique_ptr predicate(new EqualPredicate(0, 20)); const vector predicates = { predicate.get() }; OlapReaderStatistics stats; @@ -964,23 +964,24 @@ TEST_F(SegmentReaderWriterTest, TestStringDict) { TEST_F(SegmentReaderWriterTest, TestBitmapPredicate) { TabletSchema tablet_schema = create_schema({ - create_int_key(1), create_int_key(2), create_int_value(3), create_int_value(4)}); + create_int_key(1, true, false, true), + create_int_key(2, true, false, true), + create_int_value(3), + create_int_value(4)}); SegmentWriterOptions opts; - opts.need_bitmap_index = true; // produce bitmap index for value columns 2 and 3 - shared_ptr segment; build_segment(opts, tablet_schema, tablet_schema, 4096, DefaultIntGenerator, &segment); - ASSERT_TRUE(segment->footer().columns(2).has_bitmap_index()); - ASSERT_TRUE(segment->footer().columns(3).has_bitmap_index()); + ASSERT_TRUE(segment->footer().columns(0).has_bitmap_index()); + ASSERT_TRUE(segment->footer().columns(1).has_bitmap_index()); { Schema schema(tablet_schema); - // test where v1=12 + // test where v1=10 { std::vector column_predicates; - std::unique_ptr predicate(new EqualPredicate(2, 12)); + std::unique_ptr predicate(new EqualPredicate(0, 10)); column_predicates.emplace_back(predicate.get()); StorageReadOptions read_opts; @@ -997,11 +998,11 @@ TEST_F(SegmentReaderWriterTest, TestBitmapPredicate) { ASSERT_EQ(read_opts.stats->raw_rows_read, 1); } - // test where v1=12 and v2=13 + // test where v1=10 and v2=11 { std::vector column_predicates; - std::unique_ptr predicate(new EqualPredicate(2, 12)); - std::unique_ptr predicate2(new EqualPredicate(3, 13)); + std::unique_ptr predicate(new EqualPredicate(0, 10)); + std::unique_ptr predicate2(new EqualPredicate(1, 11)); column_predicates.emplace_back(predicate.get()); column_predicates.emplace_back(predicate2.get()); @@ -1019,11 +1020,11 @@ TEST_F(SegmentReaderWriterTest, TestBitmapPredicate) { ASSERT_EQ(read_opts.stats->raw_rows_read, 1); } - // test where v1=12 and v2=15 + // test where v1=10 and v2=15 { std::vector column_predicates; - std::unique_ptr predicate(new EqualPredicate(2, 12)); - std::unique_ptr predicate2(new EqualPredicate(3, 15)); + std::unique_ptr predicate(new EqualPredicate(0, 10)); + std::unique_ptr predicate2(new EqualPredicate(1, 15)); column_predicates.emplace_back(predicate.get()); column_predicates.emplace_back(predicate2.get()); @@ -1040,14 +1041,14 @@ TEST_F(SegmentReaderWriterTest, TestBitmapPredicate) { ASSERT_EQ(read_opts.stats->raw_rows_read, 0); } - // test where v1 in (12,22,1) + // test where v1 in (10,20,1) { std::vector column_predicates; std::set values; - values.insert(12); - values.insert(22); + values.insert(10); + values.insert(20); values.insert(1); - std::unique_ptr predicate(new InListPredicate(2, std::move(values))); + std::unique_ptr predicate(new InListPredicate(0, std::move(values))); column_predicates.emplace_back(predicate.get()); 
StorageReadOptions read_opts; @@ -1063,13 +1064,13 @@ TEST_F(SegmentReaderWriterTest, TestBitmapPredicate) { ASSERT_EQ(read_opts.stats->raw_rows_read, 2); } - // test where v1 not in (12,22) + // test where v1 not in (10,20) { std::vector column_predicates; std::set values; - values.insert(12); - values.insert(22); - std::unique_ptr predicate(new NotInListPredicate(2, std::move(values))); + values.insert(10); + values.insert(20); + std::unique_ptr predicate(new NotInListPredicate(0, std::move(values))); column_predicates.emplace_back(predicate.get()); StorageReadOptions read_opts; diff --git a/be/test/olap/schema_change_test.cpp b/be/test/olap/schema_change_test.cpp index c329ca1cd91945..da81905bcd517b 100644 --- a/be/test/olap/schema_change_test.cpp +++ b/be/test/olap/schema_change_test.cpp @@ -189,12 +189,14 @@ class TestColumn : public testing::Test { &_stats), OLAP_SUCCESS); } - void AddColumn(std::string name, - std::string type, - std::string aggregation, - uint32_t length, - bool is_allow_null, - bool is_key) { + void SetTabletSchema(const std::string& name, + const std::string& type, + const std::string& aggregation, + uint32_t length, + bool is_allow_null, + bool is_key, + TabletSchema* tablet_schema) { + TabletSchemaPB tablet_schema_pb; ColumnPB* column = tablet_schema_pb.add_column(); column->set_unique_id(0); column->set_name(name); @@ -203,12 +205,6 @@ class TestColumn : public testing::Test { column->set_is_nullable(is_allow_null); column->set_length(length); column->set_aggregation(aggregation); - column->set_precision(1000); - column->set_frac(1000); - column->set_is_bf_column(false); - } - - void InitTablet(TabletSchema* tablet_schema) { tablet_schema->init_from_pb(tablet_schema_pb); } @@ -216,91 +212,45 @@ class TestColumn : public testing::Test { ASSERT_EQ(_column_writer->create_row_index_entry(), OLAP_SUCCESS); } - void test_convert_from_varchar(std::string type_name, int type_size, - std::string normal_value, - std::string overflow_value, - std::string invalid_value="invalid") { - AddColumn( - "VarcharColumn", - "VARCHAR", - "REPLACE", - 255, - false, - true); - - AddColumn( - "ConvertColumn", - type_name, - "REPLACE", - type_size, - false, - false); - + void test_convert_from_varchar(std::string type_name, int type_size, const std::string& value, OLAPStatus expected_st) { TabletSchema tablet_schema; - InitTablet(&tablet_schema); + SetTabletSchema("VarcharColumn", "VARCHAR", "REPLACE", 255, false, false, &tablet_schema); CreateColumnWriter(tablet_schema); RowCursor write_row; write_row.init(tablet_schema); - RowBlock block(&tablet_schema); RowBlockInfo block_info; block_info.row_num = 10000; block.init(block_info); - - Slice normal_str(normal_value); + Slice normal_str(value); write_row.set_field_content(0, reinterpret_cast(&normal_str), _mem_pool.get()); block.set_row(0, write_row); block.finalize(1); ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); - ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); + helper.close(); + TabletSchema converted_tablet_schema; + SetTabletSchema("ConvertColumn", type_name, "REPLACE", type_size, false, false, &converted_tablet_schema); CreateColumnReader(tablet_schema); - RowCursor read_row; - read_row.init(tablet_schema); + read_row.init(converted_tablet_schema); _col_vector.reset(new ColumnVector()); ASSERT_EQ(_column_reader->next_vector(_col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); char* data = reinterpret_cast(_col_vector->col_data()); - 
read_row.set_field_content(0, data, _mem_pool.get()); - const Field* src_field = read_row.column_schema(0); - read_row.convert_from(1, read_row.cell_ptr(0), src_field->type_info(), _mem_pool.get()); - std::string dst_str = read_row.column_schema(1)->to_string(read_row.cell_ptr(1)); - ASSERT_EQ(normal_value, dst_str); - - Slice invalid_str("invalid"); - write_row.set_field_content(0, reinterpret_cast(&invalid_str), _mem_pool.get()); - block.set_row(0, write_row); - block.finalize(1); - ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); - - _col_vector.reset(new ColumnVector()); - ASSERT_EQ(_column_reader->next_vector(_col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); - data = reinterpret_cast(_col_vector->col_data()); - read_row.set_field_content(0, data, _mem_pool.get()); - const Field* src_field2 = read_row.column_schema(0); - ASSERT_EQ(read_row.convert_from(1, read_row.cell_ptr(0), src_field2->type_info(), _mem_pool.get()), OLAP_ERR_INVALID_SCHEMA); - - Slice overflow_str(overflow_value); - write_row.set_field_content(0, reinterpret_cast(&overflow_str), _mem_pool.get()); - block.set_row(0, write_row); - block.finalize(1); - ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS); - - _col_vector.reset(new ColumnVector()); - ASSERT_EQ(_column_reader->next_vector(_col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); - data = reinterpret_cast(_col_vector->col_data()); - read_row.set_field_content(0, data, _mem_pool.get()); - const Field* src_field3 = read_row.column_schema(0); - ASSERT_EQ(read_row.convert_from(1, read_row.cell_ptr(0), src_field3->type_info(), _mem_pool.get()), OLAP_ERR_INVALID_SCHEMA); + auto st = read_row.convert_from(0, data, write_row.column_schema(0)->type_info(), _mem_pool.get()); + ASSERT_EQ(st, expected_st); + if (st == OLAP_SUCCESS) { + std::string dst_str = read_row.column_schema(0)->to_string(read_row.cell_ptr(0)); + ASSERT_TRUE(dst_str.compare(0, value.size(), value) == 0); + } TypeInfo* tp = get_type_info(OLAP_FIELD_TYPE_HLL); - OLAPStatus st = read_row.convert_from(1, read_row.cell_ptr(0), tp, _mem_pool.get()); + st = read_row.convert_from(0, read_row.cell_ptr(0), tp, _mem_pool.get()); ASSERT_EQ(st, OLAP_ERR_INVALID_SCHEMA); - } ColumnWriter *_column_writer; @@ -312,8 +262,6 @@ class TestColumn : public testing::Test { OutStreamFactory *_stream_factory; - TabletSchemaPB tablet_schema_pb; - std::vector _offsets; std::vector _present_buffers; std::vector _data_buffers; @@ -328,24 +276,8 @@ class TestColumn : public testing::Test { TEST_F(TestColumn, ConvertFloatToDouble) { - // write data - AddColumn( - "FloatColumn", - "FLOAT", - "REPLACE", - 4, - false, - true); - AddColumn( - "DoubleColumn", - "DOUBLE", - "REPLACE", - 4, - false, - false); - TabletSchema tablet_schema; - InitTablet(&tablet_schema); + SetTabletSchema("FloatColumn", "FLOAT", "REPLACE", 4, false, false, &tablet_schema); CreateColumnWriter(tablet_schema); RowCursor write_row; @@ -372,65 +304,47 @@ TEST_F(TestColumn, ConvertFloatToDouble) { ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); // read data + TabletSchema convert_tablet_schema; + SetTabletSchema("DoubleColumn", "DOUBLE", "REPLACE", 4, false, false, &convert_tablet_schema); CreateColumnReader(tablet_schema); - RowCursor read_row; - read_row.init(tablet_schema); - + read_row.init(convert_tablet_schema); _col_vector.reset(new ColumnVector()); ASSERT_EQ(_column_reader->next_vector( _col_vector.get(), 2, _mem_pool.get()), OLAP_SUCCESS); char* data = 
reinterpret_cast(_col_vector->col_data()); - read_row.set_field_content(0, data, _mem_pool.get()); - read_row.convert_from(1, data, read_row.column_schema(0)->type_info(), _mem_pool.get()); - //float val1 = *reinterpret_cast( read_row.cell_ptr(0)); - double val2 = *reinterpret_cast( read_row.cell_ptr(1)); + read_row.convert_from(0, data, write_row.column_schema(0)->type_info(), _mem_pool.get()); + //float val1 = *reinterpret_cast(read_row.cell_ptr(0)); + double val2 = *reinterpret_cast(read_row.cell_ptr(0)); char buf[64]; memset(buf,0,sizeof(buf)); sprintf(buf,"%f",val2); char* tg; double v2 = strtod(buf,&tg); - ASSERT_TRUE( v2 == 1.234 ); + ASSERT_TRUE(v2 == 1.234); //test not support type TypeInfo* tp = get_type_info(OLAP_FIELD_TYPE_HLL); - OLAPStatus st = read_row.convert_from(1, data, tp, _mem_pool.get()); + OLAPStatus st = read_row.convert_from(0, data, tp, _mem_pool.get()); ASSERT_TRUE( st == OLAP_ERR_INVALID_SCHEMA); } TEST_F(TestColumn, ConvertDatetimeToDate) { - // write data - AddColumn( - "DatetimeColumn", - "DATETIME", - "REPLACE", - 8, - false, - true); - AddColumn( - "DateColumn", - "DATE", - "REPLACE", - 3, - false, - false); - TabletSchema tablet_schema; - InitTablet( &tablet_schema ); + SetTabletSchema("DatetimeColumn", "DATETIME", "REPLACE", 8, false, false, &tablet_schema); CreateColumnWriter(tablet_schema); - + RowCursor write_row; write_row.init(tablet_schema); - RowBlock block(&tablet_schema); RowBlockInfo block_info; block_info.row_num = 10000; block.init(block_info); - std::vector val_string_array; - val_string_array.push_back("2019-11-25 19:07:00"); - val_string_array.push_back("2019-11-24"); + std::vector val_string_array; + std::string origin_val = "2019-11-25 19:07:00"; + val_string_array.emplace_back(origin_val); OlapTuple tuple(val_string_array); write_row.from_tuple(tuple); block.set_row(0, write_row); @@ -441,47 +355,29 @@ TEST_F(TestColumn, ConvertDatetimeToDate) { ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); // read data + TabletSchema convert_tablet_schema; + SetTabletSchema("DateColumn", "DATE", "REPLACE", 3, false, false, &convert_tablet_schema); CreateColumnReader(tablet_schema); - RowCursor read_row; - read_row.init(tablet_schema); + read_row.init(convert_tablet_schema); _col_vector.reset(new ColumnVector()); ASSERT_EQ(_column_reader->next_vector( _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); char* data = reinterpret_cast(_col_vector->col_data()); - read_row.set_field_content(0, data, _mem_pool.get()); - char* src = read_row.cell_ptr(0); - const Field* src_field = read_row.column_schema(0); - read_row.convert_from(1,src, src_field->type_info(), _mem_pool.get()); - read_row.cell_ptr(1); - std::string dest_string = read_row.column_schema(1)->to_string(read_row.cell_ptr(1)); + read_row.convert_from(0 , data, write_row.column_schema(0)->type_info(), _mem_pool.get()); + std::string dest_string = read_row.column_schema(0)->to_string(read_row.cell_ptr(0)); ASSERT_TRUE(strncmp(dest_string.c_str(), "2019-11-25", strlen("2019-11-25")) == 0); //test not support type TypeInfo* tp = get_type_info(OLAP_FIELD_TYPE_HLL); - OLAPStatus st = read_row.convert_from(1, src, tp, _mem_pool.get()); + OLAPStatus st = read_row.convert_from(0, data, tp, _mem_pool.get()); ASSERT_TRUE( st == OLAP_ERR_INVALID_SCHEMA); } TEST_F(TestColumn, ConvertDateToDatetime) { - AddColumn( - "DateColumn", - "DATE", - "REPLACE", - 3, - false, - true); - AddColumn( - "DateTimeColumn", - "DATETIME", - "REPLACE", - 8, - false, - false); - TabletSchema tablet_schema; - 
InitTablet(&tablet_schema); + SetTabletSchema("DateColumn", "DATE", "REPLACE", 3, false, false, &tablet_schema); CreateColumnWriter(tablet_schema); RowCursor write_row; @@ -494,9 +390,7 @@ TEST_F(TestColumn, ConvertDateToDatetime) { std::vector val_string_array; std::string origin_val = "2019-12-04"; - std::string convert_val = "2019-12-04 00:00:00"; val_string_array.emplace_back(origin_val); - val_string_array.emplace_back(convert_val); OlapTuple tuple(val_string_array); write_row.from_tuple(tuple); block.set_row(0, write_row); @@ -506,47 +400,29 @@ TEST_F(TestColumn, ConvertDateToDatetime) { ColumnDataHeaderMessage header_message; ASSERT_EQ(_column_writer->finalize(&header_message), OLAP_SUCCESS); + TabletSchema convert_tablet_schema; + SetTabletSchema("DateTimeColumn", "DATETIME", "REPLACE", 8, false, false, &convert_tablet_schema); CreateColumnReader(tablet_schema); RowCursor read_row; - read_row.init(tablet_schema); + read_row.init(convert_tablet_schema); _col_vector.reset(new ColumnVector()); ASSERT_EQ(_column_reader->next_vector( _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); char* data = reinterpret_cast(_col_vector->col_data()); read_row.set_field_content(0, data, _mem_pool.get()); - char* src = read_row.cell_ptr(0); - const Field* src_field = read_row.column_schema(0); - read_row.convert_from(1, src, src_field->type_info(), _mem_pool.get()); - read_row.cell_ptr(1); - std::string dest_string = read_row.column_schema(1)->to_string(read_row.cell_ptr(1)); - ASSERT_TRUE(dest_string.compare(convert_val) == 0); + read_row.convert_from(0, data, write_row.column_schema(0)->type_info(), _mem_pool.get()); + std::string dest_string = read_row.column_schema(0)->to_string(read_row.cell_ptr(0)); + ASSERT_TRUE(dest_string.compare("2019-12-04 00:00:00") == 0); //test not support type TypeInfo* tp = get_type_info(OLAP_FIELD_TYPE_HLL); - OLAPStatus st = read_row.convert_from(1, src, tp, _mem_pool.get()); + OLAPStatus st = read_row.convert_from(0, data, tp, _mem_pool.get()); ASSERT_TRUE( st == OLAP_ERR_INVALID_SCHEMA); } TEST_F(TestColumn, ConvertIntToDate) { - AddColumn( - "IntColumn", - "INT", - "REPLACE", - 4, - false, - true); - - AddColumn( - "DateColumn", - "DATE", - "REPLACE", - 3, - false, - false); - - TabletSchema tablet_schema; - InitTablet(&tablet_schema); + SetTabletSchema("IntColumn", "INT", "REPLACE", 4, false, false, &tablet_schema); CreateColumnWriter(tablet_schema); RowCursor write_row; @@ -566,48 +442,30 @@ TEST_F(TestColumn, ConvertIntToDate) { ColumnDataHeaderMessage header; ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS); - // read data + TabletSchema convert_tablet_schema; + SetTabletSchema("DateColumn", "DATE", "REPLACE", 3, false, false, &convert_tablet_schema); CreateColumnReader(tablet_schema); RowCursor read_row; - read_row.init(tablet_schema); + read_row.init(convert_tablet_schema); _col_vector.reset(new ColumnVector()); ASSERT_EQ(_column_reader->next_vector( _col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS); char* data = reinterpret_cast(_col_vector->col_data()); - read_row.set_field_content(0, data, _mem_pool.get()); - const Field* src_field = read_row.column_schema(0); - read_row.convert_from(1, read_row.cell_ptr(0), src_field->type_info(), _mem_pool.get()); - std::string dest_string = read_row.column_schema(1)->to_string(read_row.cell_ptr(1)); + read_row.convert_from(0, data, write_row.column_schema(0)->type_info(), _mem_pool.get()); + std::string dest_string = read_row.column_schema(0)->to_string(read_row.cell_ptr(0)); 
     ASSERT_TRUE(strncmp(dest_string.c_str(), "2019-12-05", strlen("2019-12-05")) == 0);
 
     //test not support type
     TypeInfo* tp = get_type_info(OLAP_FIELD_TYPE_HLL);
-    OLAPStatus st = read_row.convert_from(1, read_row.cell_ptr(0), tp, _mem_pool.get());
+    OLAPStatus st = read_row.convert_from(0, read_row.cell_ptr(0), tp, _mem_pool.get());
     ASSERT_TRUE( st == OLAP_ERR_INVALID_SCHEMA);
 }
 
 TEST_F(TestColumn, ConvertVarcharToDate) {
-    AddColumn(
-            "VarcharColumn",
-            "VARCHAR",
-            "REPLACE",
-            255,
-            false,
-            true);
-
-    AddColumn(
-            "DateColumn",
-            "DATE",
-            "REPLACE",
-            3,
-            false,
-            false);
-
     TabletSchema tablet_schema;
-    InitTablet(&tablet_schema);
+    SetTabletSchema("VarcharColumn", "VARCHAR", "REPLACE", 255, false, false, &tablet_schema);
     CreateColumnWriter(tablet_schema);
 
     RowCursor write_row;
@@ -637,71 +495,72 @@
     ColumnDataHeaderMessage header;
     ASSERT_EQ(_column_writer->finalize(&header), OLAP_SUCCESS);
+    // because file_helper is reused in this case, we should close it.
+    helper.close();
+    TabletSchema convert_tablet_schema;
+    SetTabletSchema("DateColumn", "DATE", "REPLACE", 3, false, false, &convert_tablet_schema);
     CreateColumnReader(tablet_schema);
-    RowCursor read_row;
-    read_row.init(tablet_schema);
+    read_row.init(convert_tablet_schema);
 
     _col_vector.reset(new ColumnVector());
     ASSERT_EQ(_column_reader->next_vector(_col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS);
     char *data = reinterpret_cast<char*>(_col_vector->col_data());
-    read_row.set_field_content(0, data, _mem_pool.get());
-    const Field *src_field = read_row.column_schema(0);
-    read_row.convert_from(1, read_row.cell_ptr(0), src_field->type_info(), _mem_pool.get());
-    std::string dst_str = read_row.column_schema(1)->to_string(read_row.cell_ptr(1));
+    read_row.convert_from(0, data, write_row.column_schema(0)->type_info(), _mem_pool.get());
+    std::string dst_str = read_row.column_schema(0)->to_string(read_row.cell_ptr(0));
     ASSERT_EQ(expected_val, dst_str);
     }
-
-    // test invalid schema change
-    Slice invalid_str("invalid");
-    write_row.set_field_content(0, reinterpret_cast<char*>(&invalid_str), _mem_pool.get());
-    block.set_row(0, write_row);
-    block.finalize(1);
-    ASSERT_EQ(_column_writer->write_batch(&block, &write_row), OLAP_SUCCESS);
-
-    _col_vector.reset(new ColumnVector());
-    ASSERT_EQ(_column_reader->next_vector(_col_vector.get(), 1, _mem_pool.get()), OLAP_SUCCESS);
-    char* data = reinterpret_cast<char*>(_col_vector->col_data());
+    helper.close();
+    TabletSchema convert_tablet_schema;
+    SetTabletSchema("DateColumn", "DATE", "REPLACE", 3, false, false, &convert_tablet_schema);
+    CreateColumnReader(tablet_schema);
     RowCursor read_row;
-    read_row.init(tablet_schema);
-    read_row.set_field_content(0, data, _mem_pool.get());
-    const Field* src_field2 = read_row.column_schema(0);
-    ASSERT_EQ(read_row.convert_from(1, read_row.cell_ptr(0), src_field2->type_info(), _mem_pool.get()), OLAP_ERR_INVALID_SCHEMA);
+    read_row.init(convert_tablet_schema);
 
     //test not support type
     TypeInfo* tp = get_type_info(OLAP_FIELD_TYPE_HLL);
-    OLAPStatus st = read_row.convert_from(1, read_row.cell_ptr(0), tp, _mem_pool.get());
+    OLAPStatus st = read_row.convert_from(0, read_row.cell_ptr(0), tp, _mem_pool.get());
     ASSERT_EQ(st, OLAP_ERR_INVALID_SCHEMA);
 }
 
 TEST_F(TestColumn, ConvertVarcharToTinyInt) {
-    test_convert_from_varchar("TINYINT", 1, "127", "128");
+    test_convert_from_varchar("TINYINT", 1, "127", OLAP_SUCCESS);
+    test_convert_from_varchar("TINYINT",
1, "128", OLAP_ERR_INVALID_SCHEMA); } TEST_F(TestColumn, ConvertVarcharToSmallInt) { - test_convert_from_varchar("SMALLINT", 2, "32767", "32768"); + test_convert_from_varchar("SMALLINT", 2, "32767", OLAP_SUCCESS); + test_convert_from_varchar("SMALLINT", 2, "32768", OLAP_ERR_INVALID_SCHEMA); } TEST_F(TestColumn, ConvertVarcharToInt) { - test_convert_from_varchar("INT", 4, "2147483647", "2147483648"); + test_convert_from_varchar("INT", 4, "2147483647", OLAP_SUCCESS); + test_convert_from_varchar("INT", 4, "2147483648", OLAP_ERR_INVALID_SCHEMA); } TEST_F(TestColumn, ConvertVarcharToBigInt) { - test_convert_from_varchar("BIGINT", 8, "9223372036854775807", "9223372036854775808"); + test_convert_from_varchar("BIGINT", 8, "9223372036854775807", OLAP_SUCCESS); + test_convert_from_varchar("BIGINT", 8, "9223372036854775808", OLAP_ERR_INVALID_SCHEMA); } TEST_F(TestColumn, ConvertVarcharToLargeInt) { - test_convert_from_varchar("LARGEINT", 16, "170141183460469000000000000000000000000", "1701411834604690000000000000000000000000"); + test_convert_from_varchar("LARGEINT", 16, "170141183460469000000000000000000000000", OLAP_SUCCESS); + test_convert_from_varchar("LARGEINT", 16, "1701411834604690000000000000000000000000", OLAP_ERR_INVALID_SCHEMA); } TEST_F(TestColumn, ConvertVarcharToFloat) { - test_convert_from_varchar("FLOAT", 4, "3.40282e+38", "3.40282e+39"); + test_convert_from_varchar("FLOAT", 4, "3.40282e+38", OLAP_SUCCESS); + test_convert_from_varchar("FLOAT", 4, "1797690000000000063230304921389426434930330364336853362154109832891264341489062899406152996321966094455338163203127744334848599000464911410516510916727344709727599413825823048028128827530592629736371829425359826368844446113768685826367454055532068818593409163400929532301499014067384276511218551077374242324480.999", OLAP_ERR_INVALID_SCHEMA); } TEST_F(TestColumn, ConvertVarcharToDouble) { test_convert_from_varchar("DOUBLE", 8, - "179769000000000006323030492138942643493033036433685336215410983289126434148906289940615299632196609445533816320312774433484859900046491141051651091672734470972759941382582304802812882753059262973637182942535982636884444611376868582636745405553206881859340916340092953230149901406738427651121855107737424232448.0000000000", - "1797690000000000063230304921389426434930330364336853362154109832891264341489062899406152996321966094455338163203127744334848599000464911410516510916727344709727599413825823048028128827530592629736371829425359826368844446113768685826367454055532068818593409163400929532301499014067384276511218551077374242324480.0000000000"); + "123.456", OLAP_SUCCESS); + test_convert_from_varchar("DOUBLE", 8, + "1797690000000000063230304921389426434930330364336853362154109832891264341489062899406152996321966094455338163203127744334848599000464911410516510916727344709727599413825823048028128827530592629736371829425359826368844446113768685826367454055532068818593409163400929532301499014067384276511218551077374242324480.0000000000", OLAP_ERR_INVALID_SCHEMA); } } diff --git a/be/test/olap/tablet_schema_helper.h b/be/test/olap/tablet_schema_helper.h index d8cd6f08fa91c4..5329c774c69c12 100644 --- a/be/test/olap/tablet_schema_helper.h +++ b/be/test/olap/tablet_schema_helper.h @@ -24,7 +24,8 @@ namespace doris { -TabletColumn create_int_key(int32_t id, bool is_nullable = true, bool is_bf_column = false) { +TabletColumn create_int_key(int32_t id, bool is_nullable = true, + bool is_bf_column = false, bool has_bitmap_index = false) { TabletColumn column; column._unique_id = id; column._col_name = std::to_string(id); @@ -34,6 
+35,7 @@ TabletColumn create_int_key(int32_t id, bool is_nullable = true, bool is_bf_colu column._length = 4; column._index_length = 4; column._is_bf_column = is_bf_column; + column._has_bitmap_index = has_bitmap_index; return column; } @@ -41,7 +43,8 @@ TabletColumn create_int_key(int32_t id, bool is_nullable = true, bool is_bf_colu TabletColumn create_int_value( int32_t id, FieldAggregationMethod agg_method = OLAP_FIELD_AGGREGATION_SUM, - bool is_nullable = true, const std::string default_value = "", bool is_bf_column = false) { + bool is_nullable = true, const std::string default_value = "", + bool is_bf_column = false, bool has_bitmap_index = false) { TabletColumn column; column._unique_id = id; column._col_name = std::to_string(id); @@ -56,6 +59,7 @@ TabletColumn create_int_value( column._default_value = default_value; } column._is_bf_column = is_bf_column; + column._has_bitmap_index = has_bitmap_index; return column; } diff --git a/be/test/util/CMakeLists.txt b/be/test/util/CMakeLists.txt index 56f03fa2e6f904..6b05ec01094659 100644 --- a/be/test/util/CMakeLists.txt +++ b/be/test/util/CMakeLists.txt @@ -53,3 +53,4 @@ ADD_BE_TEST(frame_of_reference_coding_test) ADD_BE_TEST(bit_stream_utils_test) ADD_BE_TEST(radix_sort_test) ADD_BE_TEST(zip_util_test) +ADD_BE_TEST(utf8_check_test) diff --git a/be/test/util/string_parser_test.cpp b/be/test/util/string_parser_test.cpp index dcfb3a6bfe2707..0fdc8c1c14aae3 100644 --- a/be/test/util/string_parser_test.cpp +++ b/be/test/util/string_parser_test.cpp @@ -110,7 +110,7 @@ void test_float_value_is_nan(const std::string& s, StringParser::ParseResult exp EXPECT_EQ(exp_result, result); if (exp_result == StringParser::PARSE_SUCCESS && result == exp_result) { - EXPECT_TRUE(isnan(val)); + EXPECT_TRUE(std::isnan(val)); } } diff --git a/be/test/util/utf8_check_test.cpp b/be/test/util/utf8_check_test.cpp new file mode 100644 index 00000000000000..9619e8e19a9bd7 --- /dev/null +++ b/be/test/util/utf8_check_test.cpp @@ -0,0 +1,124 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +#include "util/utf8_check.h" + +#include + +namespace doris { + +struct test { + const char *data; + int len; +}; + +class Utf8CheckTest : public testing::Test { +public: + Utf8CheckTest() { } + virtual ~Utf8CheckTest() { } +private: + /* positive tests */ + std::vector pos = { + {"", 0}, + {"\x00", 1}, + {"\x66", 1}, + {"\x7F", 1}, + {"\x00\x7F", 2}, + {"\x7F\x00", 2}, + {"\xC2\x80", 2}, + {"\xDF\xBF", 2}, + {"\xE0\xA0\x80", 3}, + {"\xE0\xA0\xBF", 3}, + {"\xED\x9F\x80", 3}, + {"\xEF\x80\xBF", 3}, + {"\xF0\x90\xBF\x80", 4}, + {"\xF2\x81\xBE\x99", 4}, + {"\xF4\x8F\x88\xAA", 4} + }; + + /* negative tests */ + std::vector neg = { + {"\x80", 1}, + {"\xBF", 1}, + {"\xC0\x80", 2}, + {"\xC1\x00", 2}, + {"\xC2\x7F", 2}, + {"\xDF\xC0", 2}, + {"\xE0\x9F\x80", 3}, + {"\xE0\xC2\x80", 3}, + {"\xED\xA0\x80", 3}, + {"\xED\x7F\x80", 3}, + {"\xEF\x80\x00", 3}, + {"\xF0\x8F\x80\x80", 4}, + {"\xF0\xEE\x80\x80", 4}, + {"\xF2\x90\x91\x7F", 4}, + {"\xF4\x90\x88\xAA", 4}, + {"\xF4\x00\xBF\xBF", 4}, + {"\x00\x00\x00\x00\x00\xC2\x80\x00\x00\x00\xE1\x80\x80\x00\x00\xC2" \ + "\xC2\x80\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00", + 32}, + {"\x00\x00\x00\x00\x00\xC2\xC2\x80\x00\x00\xE1\x80\x80\x00\x00\x00", + 16}, + {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \ + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80", + 32}, + {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \ + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1", + 32}, + {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \ + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80" \ + "\x80", 33}, + {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \ + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF1\x80" \ + "\xC2\x80", 34}, + {"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" \ + "\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xF0" \ + "\x80\x80\x80", 35} + }; +}; +TEST_F(Utf8CheckTest, empty) { + ASSERT_TRUE(validate_utf8(pos[0].data, pos[0].len)); +} + +TEST_F(Utf8CheckTest, normal) { + for (int i = 0; i < sizeof(pos)/sizeof(pos[0]); ++i) { + ASSERT_TRUE(validate_utf8(pos[i].data, pos[i].len)); + } +} + +TEST_F(Utf8CheckTest, abnormal) { + for (int i = 0; i < sizeof(neg)/sizeof(neg[0]); ++i) { + ASSERT_FALSE(validate_utf8(neg[i].data, neg[i].len)); + } +} + +TEST_F(Utf8CheckTest, naive) { + for (int i = 0; i < sizeof(pos)/sizeof(pos[0]); ++i) { + ASSERT_TRUE(validate_utf8_naive(pos[i].data, pos[i].len)); + } + for (int i = 0; i < sizeof(neg)/sizeof(neg[0]); ++i) { + std::cout << validate_utf8_naive(neg[i].data, neg[i].len) << std::endl; + ASSERT_FALSE(validate_utf8_naive(neg[i].data, neg[i].len)); + } +} + +} + +int main(int argc, char* argv[]) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/conf/fe.conf b/conf/fe.conf index ab7e320965ad06..9b6e2a0b389dd4 100644 --- a/conf/fe.conf +++ b/conf/fe.conf @@ -41,6 +41,7 @@ http_port = 8030 rpc_port = 9020 query_port = 9030 edit_log_port = 9010 +mysql_service_nio_enabled = true # Choose one if there are more than one ip except loopback address. # Note that there should at most one ip match this list. 
diff --git a/docs/documentation/cn/getting-started/data-model-rollup.md b/docs/documentation/cn/getting-started/data-model-rollup.md index 66203938187746..733634d93da51d 100644 --- a/docs/documentation/cn/getting-started/data-model-rollup.md +++ b/docs/documentation/cn/getting-started/data-model-rollup.md @@ -61,15 +61,15 @@ Doris 的数据模型主要分为3类: ``` CREATE TABLE IF NOT EXISTS example_db.expamle_tbl ( - `user_id` LARGEINT NOT NULL COMMENT "用户id", - `date` DATE NOT NULL COMMENT "数据灌入日期时间", - `city` VARCHAR(20) COMMENT "用户所在城市", - `age` SMALLINT COMMENT "用户年龄", - `sex` TINYINT COMMENT "用户性别", - `last_visit_date` DATETIME REPLACE DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次访问时间", - `cost` BIGINT SUM DEFAULT "0" COMMENT "用户总消费", - `max_dwell_time` INT MAX DEFAULT "0" COMMENT "用户最大停留时间", - `min_dwell_time` INT MIN DEFAULT "99999" COMMENT "用户最小停留时间", + `user_id` LARGEINT NOT NULL COMMENT "用户id", + `date` DATE NOT NULL COMMENT "数据灌入日期时间", + `city` VARCHAR(20) COMMENT "用户所在城市", + `age` SMALLINT COMMENT "用户年龄", + `sex` TINYINT COMMENT "用户性别", + `last_visit_date` DATETIME REPLACE DEFAULT "1970-01-01 00:00:00" COMMENT "用户最后一次访问时间", + `cost` BIGINT SUM DEFAULT "0" COMMENT "用户总消费", + `max_dwell_time` INT MAX DEFAULT "0" COMMENT "用户最大停留时间", + `min_dwell_time` INT MIN DEFAULT "99999" COMMENT "用户最小停留时间", ) AGGREGATE KEY(`user_id`, `date`, `timestamp`, `city`, `age`, `sex`) ... /* 省略 Partition 和 Distribution 信息 */ @@ -130,7 +130,7 @@ AGGREGATE KEY(`user_id`, `date`, `timestamp`, `city`, `age`, `sex`) 前5列没有变化,从第6列 `last_visit_date` 开始: * `2017-10-01 07:00:00`:因为 `last_visit_date` 列的聚合方式为 REPLACE,所以 `2017-10-01 07:00:00` 替换了 `2017-10-01 06:00:00` 保存了下来。 - > 注:在同一个导入批次中的数据,对于 REPLACE 这种聚合方式,替换顺序不做保证。如在这个例子中,最终保存下来的,也有可能是 `2017-10-01 06:00:00`。而对于不同导入批次中的数据,可以保证,后一批次的数据会替换前一批次。 + > 注:在同一个导入批次中的数据,对于 REPLACE 这种聚合方式,替换顺序不做保证。如在这个例子中,最终保存下来的,也有可能是 `2017-10-01 06:00:00`。而对于不同导入批次中的数据,可以保证,后一批次的数据会替换前一批次。 * `35`:因为 `cost` 列的聚合类型为 SUM,所以由 20 + 15 累加获得 35。 * `10`:因为 `max_dwell_time` 列的聚合类型为 MAX,所以 10 和 2 取最大值,获得 10。 @@ -245,14 +245,14 @@ AGGREGATE KEY(`user_id`, `date`, `timestamp`, `city`, `age`, `sex`) ``` CREATE TABLE IF NOT EXISTS example_db.expamle_tbl ( - `user_id` LARGEINT NOT NULL COMMENT "用户id", - `username` VARCHAR(50) NOT NULL COMMENT "用户昵称", - `city` VARCHAR(20) COMMENT "用户所在城市", - `age` SMALLINT COMMENT "用户年龄", - `sex` TINYINT COMMENT "用户性别", - `phone` LARGEINT COMMENT "用户电话", - `address` VARCHAR(500) COMMENT "用户地址", - `register_time` DATETIME COMMENT "用户注册时间" + `user_id` LARGEINT NOT NULL COMMENT "用户id", + `username` VARCHAR(50) NOT NULL COMMENT "用户昵称", + `city` VARCHAR(20) COMMENT "用户所在城市", + `age` SMALLINT COMMENT "用户年龄", + `sex` TINYINT COMMENT "用户性别", + `phone` LARGEINT COMMENT "用户电话", + `address` VARCHAR(500) COMMENT "用户地址", + `register_time` DATETIME COMMENT "用户注册时间" ) UNIQUE KEY(`user_id`, `user_name`) ... 
/* 省略 Partition 和 Distribution 信息 */ @@ -277,14 +277,14 @@ UNIQUE KEY(`user_id`, `user_name`) ``` CREATE TABLE IF NOT EXISTS example_db.expamle_tbl ( - `user_id` LARGEINT NOT NULL COMMENT "用户id", - `username` VARCHAR(50) NOT NULL COMMENT "用户昵称", - `city` VARCHAR(20) REPLACE COMMENT "用户所在城市", - `age` SMALLINT REPLACE COMMENT "用户年龄", - `sex` TINYINT REPLACE COMMENT "用户性别", - `phone` LARGEINT REPLACE COMMENT "用户电话", - `address` VARCHAR(500) REPLACE COMMENT "用户地址", - `register_time` DATETIME REPLACE COMMENT "用户注册时间" + `user_id` LARGEINT NOT NULL COMMENT "用户id", + `username` VARCHAR(50) NOT NULL COMMENT "用户昵称", + `city` VARCHAR(20) REPLACE COMMENT "用户所在城市", + `age` SMALLINT REPLACE COMMENT "用户年龄", + `sex` TINYINT REPLACE COMMENT "用户性别", + `phone` LARGEINT REPLACE COMMENT "用户电话", + `address` VARCHAR(500) REPLACE COMMENT "用户地址", + `register_time` DATETIME REPLACE COMMENT "用户注册时间" ) AGGREGATE KEY(`user_id`, `user_name`) ... /* 省略 Partition 和 Distribution 信息 */ @@ -311,12 +311,12 @@ AGGREGATE KEY(`user_id`, `user_name`) ``` CREATE TABLE IF NOT EXISTS example_db.expamle_tbl ( - `timestamp` DATETIME NOT NULL COMMENT "日志时间", - `type` INT NOT NULL COMMENT "日志类型", - `error_code` INT COMMENT "错误码", - `error_msg` VARCHAR(1024) COMMENT "错误详细信息", - `op_id` BIGINT COMMENT "负责人id", - `op_time` DATETIME COMMENT "处理时间" + `timestamp` DATETIME NOT NULL COMMENT "日志时间", + `type` INT NOT NULL COMMENT "日志类型", + `error_code` INT COMMENT "错误码", + `error_msg` VARCHAR(1024) COMMENT "错误详细信息", + `op_id` BIGINT COMMENT "负责人id", + `op_time` DATETIME COMMENT "处理时间" ) DUPLICATE KEY(`timestamp`, `type`) ... /* 省略 Partition 和 Distribution 信息 */ diff --git a/docs/documentation/cn/sql-reference/sql-functions/aggregate-functions/bitmap.md b/docs/documentation/cn/sql-reference/sql-functions/aggregate-functions/bitmap.md index ebe706d0e3a03b..bc800de55ab212 100644 --- a/docs/documentation/cn/sql-reference/sql-functions/aggregate-functions/bitmap.md +++ b/docs/documentation/cn/sql-reference/sql-functions/aggregate-functions/bitmap.md @@ -105,7 +105,9 @@ COUNT(DISTINCT expr) 相同 filter_column 过滤条件的多个 bitmap 的交集的基数值。 bitmap_column_to_count 是 bitmap 类型的列,filter_column 是变化的维度列,filter_values 是维度取值列表 +`BITMAP_OR(expr,expr)`: 计算两个Bitmap列的并集,返回值是序列化后 Bitmap 值 +`BITMAP_AND(expr,expr)`:计算两个Bitmap列的交集,返回值是序列化后 Bitmap 值 注意: 1. BITMAP_UNION 函数的参数目前仅支持: diff --git a/docs/documentation/cn/sql-reference/sql-statements/Administration/SHOW INDEX.md b/docs/documentation/cn/sql-reference/sql-statements/Administration/SHOW INDEX.md new file mode 100644 index 00000000000000..8bad66be2f0f66 --- /dev/null +++ b/docs/documentation/cn/sql-reference/sql-statements/Administration/SHOW INDEX.md @@ -0,0 +1,35 @@ + + +# SHOW INDEX + +## description + + 该语句用于展示一个表中索引的相关信息,目前只支持bitmap 索引 + 语法: + SHOW INDEX[ES] FROM [db_name.]table_name; + +## example + + 1. 展示指定 table_name 的下索引 + SHOW INDEX FROM example_db.table_name; + +## keyword + + SHOW,INDEX diff --git a/docs/documentation/cn/sql-reference/sql-statements/Data Definition/ALTER TABLE.md b/docs/documentation/cn/sql-reference/sql-statements/Data Definition/ALTER TABLE.md index 4957a88a44dfd5..99ccd87bc10842 100644 --- a/docs/documentation/cn/sql-reference/sql-statements/Data Definition/ALTER TABLE.md +++ b/docs/documentation/cn/sql-reference/sql-statements/Data Definition/ALTER TABLE.md @@ -18,7 +18,9 @@ under the License. 
--> # ALTER TABLE + ## description + 该语句用于对已有的 table 进行修改。如果没有指定 rollup index,默认操作 base index。 该语句分为三种操作类型: schema change 、rollup 、partition 这三种操作类型不能同时出现在一条 ALTER TABLE 语句中。 @@ -29,7 +31,7 @@ under the License. ALTER TABLE [database.]table alter_clause1[, alter_clause2, ...]; - alter_clause 分为 partition 、rollup、schema change 和 rename 四种。 + alter_clause 分为 partition 、rollup、schema change、rename 和index五种。 partition 支持如下几种修改方式 1. 增加分区 @@ -163,8 +165,19 @@ under the License. 3. 修改 partition 名称 语法: RENAME PARTITION old_partition_name new_partition_name; - + bitmap index 支持如下几种修改方式 + 1. 创建bitmap 索引 + 语法: + ADD INDEX index_name [USING BITMAP] (column [, ...],) [COMMENT 'balabala']; + 注意: + 1. 目前仅支持bitmap 索引 + 1. BITMAP 索引仅在单列上创建 + 2. 删除索引 + 语法: + DROP INDEX index_name; + ## example + [partition] 1. 增加分区, 现有分区 [MIN, 2013-01-01),增加分区 [2013-01-01, 2014-01-01),使用默认分桶方式 ALTER TABLE example_db.my_table @@ -282,7 +295,12 @@ under the License. 3. 将表 example_table 中名为 p1 的 partition 修改为 p2 ALTER TABLE example_table RENAME PARTITION p1 p2; - + [index] + 1. 在table1 上为siteid 创建bitmap 索引 + ALTER TABLE table1 ADD INDEX index_name [USING BITMAP] (siteid) COMMENT 'balabala'; + 2. 删除table1 上的siteid列的bitmap 索引 + ALTER TABLE table1 DROP INDEX index_name; + ## keyword + ALTER,TABLE,ROLLUP,COLUMN,PARTITION,RENAME - diff --git a/docs/documentation/cn/sql-reference/sql-statements/Data Definition/CREATE INDEX.md b/docs/documentation/cn/sql-reference/sql-statements/Data Definition/CREATE INDEX.md new file mode 100644 index 00000000000000..9767f015c927f6 --- /dev/null +++ b/docs/documentation/cn/sql-reference/sql-statements/Data Definition/CREATE INDEX.md @@ -0,0 +1,38 @@ + + +# CREATE INDEX + +## description + + 该语句用于创建索引 + 语法: + CREATE INDEX index_name ON table_name (column [, ...],) [USING BITMAP] [COMMENT'balabala']; + 注意: + 1. 目前只支持bitmap 索引 + 2. BITMAP 索引仅在单列上创建 + +## example + + 1. 在table1 上为siteid 创建bitmap 索引 + CREATE INDEX index_name ON table1 (siteid) USING BITMAP COMMENT 'balabala'; + +## keyword + + CREATE,INDEX diff --git a/docs/documentation/cn/sql-reference/sql-statements/Data Definition/CREATE TABLE.md b/docs/documentation/cn/sql-reference/sql-statements/Data Definition/CREATE TABLE.md index 4c8fdb776c7882..58375c4c1600bb 100644 --- a/docs/documentation/cn/sql-reference/sql-statements/Data Definition/CREATE TABLE.md +++ b/docs/documentation/cn/sql-reference/sql-statements/Data Definition/CREATE TABLE.md @@ -18,136 +18,163 @@ under the License. --> # CREATE TABLE + ## description - 该语句用于创建 table。 + +该语句用于创建 table。 +语法: + +``` + CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [database.]table_name + (column_definition1[, column_definition2, ...] + [, index_definition1[, ndex_definition12,]]) + [ENGINE = [olap|mysql|broker]] + [key_desc] + [COMMENT "table comment"]; + [partition_desc] + [distribution_desc] + [PROPERTIES ("key"="value", ...)] + [BROKER PROPERTIES ("key"="value", ...)] +``` + +1. column_definition 语法: - CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [database.]table_name - (column_definition1[, column_definition2, ...]) - [ENGINE = [olap|mysql|broker]] - [key_desc] - [COMMENT "table comment"]; - [partition_desc] - [distribution_desc] - [PROPERTIES ("key"="value", ...)] - [BROKER PROPERTIES ("key"="value", ...)] - - 1. 
column_definition - 语法: - col_name col_type [agg_type] [NULL | NOT NULL] [DEFAULT "default_value"] - - 说明: - col_name:列名称 - col_type:列类型 - TINYINT(1字节) - 范围:-2^7 + 1 ~ 2^7 - 1 - SMALLINT(2字节) - 范围:-2^15 + 1 ~ 2^15 - 1 - INT(4字节) - 范围:-2^31 + 1 ~ 2^31 - 1 - BIGINT(8字节) - 范围:-2^63 + 1 ~ 2^63 - 1 - LARGEINT(16字节) - 范围:-2^127 + 1 ~ 2^127 - 1 - FLOAT(4字节) - 支持科学计数法 - DOUBLE(12字节) - 支持科学计数法 - DECIMAL[(precision, scale)] (16字节) - 保证精度的小数类型。默认是 DECIMAL(10, 0) - precision: 1 ~ 27 - scale: 0 ~ 9 - 其中整数部分为 1 ~ 18 - 不支持科学计数法 - DATE(3字节) - 范围:1900-01-01 ~ 9999-12-31 - DATETIME(8字节) - 范围:1900-01-01 00:00:00 ~ 9999-12-31 23:59:59 - CHAR[(length)] - 定长字符串。长度范围:1 ~ 255。默认为1 - VARCHAR[(length)] - 变长字符串。长度范围:1 ~ 65533 - HLL (1~16385个字节) - hll列类型,不需要指定长度和默认值、长度根据数据的聚合 - 程度系统内控制,并且HLL列只能通过配套的hll_union_agg、Hll_cardinality、hll_hash进行查询或使用 - BITMAP - bitmap 列类型,不需要指定长度和默认值 - BITMAP 列只能通过配套的 BITMAP_UNION、BITMAP_COUNT、TO_BITMAP 进行查询或使用 - - agg_type:聚合类型,如果不指定,则该列为 key 列。否则,该列为 value 列 - - * SUM、MAX、MIN、REPLACE - * HLL_UNION(仅用于HLL列,为HLL独有的聚合方式)、 - * BITMAP_UNION(仅用于 BITMAP 列,为 BITMAP 独有的聚合方式)、 - * REPLACE_IF_NOT_NULL:这个聚合类型的含义是当且仅当新导入数据是非NULL值时才会发生替换行为,如果新导入的数据是NULL,那么Doris仍然会保留原值。注意:如果用户在建表时REPLACE_IF_NOT_NULL列指定了NOT NULL,那么Doris仍然会将其转化为NULL,不会向用户报错。用户可以借助这个类型完成部分列导入的功能。 - *该类型只对聚合模型(key_desc的type为AGGREGATE KEY)有用,其它模型不需要指定这个。 - - 是否允许为NULL: 默认不允许为 NULL。NULL 值在导入数据中用 \N 来表示 - - 注意: - BITMAP_UNION聚合类型列在导入时的原始数据类型必须是TINYINT,SMALLINT,INT。 - - 2. ENGINE 类型 - 默认为 olap。可选 mysql, broker - 1) 如果是 mysql,则需要在 properties 提供以下信息: - - PROPERTIES ( - "host" = "mysql_server_host", - "port" = "mysql_server_port", - "user" = "your_user_name", - "password" = "your_password", - "database" = "database_name", - "table" = "table_name" - ) - - 注意: - "table" 条目中的 "table_name" 是 mysql 中的真实表名。 - 而 CREATE TABLE 语句中的 table_name 是该 mysql 表在 Palo 中的名字,可以不同。 - - 在 Palo 创建 mysql 表的目的是可以通过 Palo 访问 mysql 数据库。 - 而 Palo 本身并不维护、存储任何 mysql 数据。 - 2) 如果是 broker,表示表的访问需要通过指定的broker, 需要在 properties 提供以下信息: - PROPERTIES ( - "broker_name" = "broker_name", - "path" = "file_path1[,file_path2]", - "column_separator" = "value_separator" - "line_delimiter" = "value_delimiter" - ) - 另外还需要提供Broker需要的Property信息,通过BROKER PROPERTIES来传递,例如HDFS需要传入 - BROKER PROPERTIES( - "username" = "name", - "password" = "password" - ) - 这个根据不同的Broker类型,需要传入的内容也不相同 - 注意: - "path" 中如果有多个文件,用逗号[,]分割。如果文件名中包含逗号,那么使用 %2c 来替代。如果文件名中包含 %,使用 %25 代替 - 现在文件内容格式支持CSV,支持GZ,BZ2,LZ4,LZO(LZOP) 压缩格式。 + `col_name col_type [agg_type] [NULL | NOT NULL] [DEFAULT "default_value"]` + + 说明: + col_name:列名称 + col_type:列类型 + + ``` + TINYINT(1字节) + 范围:-2^7 + 1 ~ 2^7 - 1 + SMALLINT(2字节) + 范围:-2^15 + 1 ~ 2^15 - 1 + INT(4字节) + 范围:-2^31 + 1 ~ 2^31 - 1 + BIGINT(8字节) + 范围:-2^63 + 1 ~ 2^63 - 1 + LARGEINT(16字节) + 范围:-2^127 + 1 ~ 2^127 - 1 + FLOAT(4字节) + 支持科学计数法 + DOUBLE(12字节) + 支持科学计数法 + DECIMAL[(precision, scale)] (16字节) + 保证精度的小数类型。默认是 DECIMAL(10, 0) + precision: 1 ~ 27 + scale: 0 ~ 9 + 其中整数部分为 1 ~ 18 + 不支持科学计数法 + DATE(3字节) + 范围:1900-01-01 ~ 9999-12-31 + DATETIME(8字节) + 范围:1900-01-01 00:00:00 ~ 9999-12-31 23:59:59 + CHAR[(length)] + 定长字符串。长度范围:1 ~ 255。默认为1 + VARCHAR[(length)] + 变长字符串。长度范围:1 ~ 65533 + HLL (1~16385个字节) + hll列类型,不需要指定长度和默认值、长度根据数据的聚合 + 程度系统内控制,并且HLL列只能通过配套的hll_union_agg、Hll_cardinality、hll_hash进行查询或使用 + BITMAP + bitmap 列类型,不需要指定长度和默认值 + BITMAP 列只能通过配套的 BITMAP_UNION、BITMAP_COUNT、TO_BITMAP 进行查询或使用 + ``` + + agg_type:聚合类型,如果不指定,则该列为 key 列。否则,该列为 value 列 + * SUM、MAX、MIN、REPLACE + * HLL_UNION(仅用于HLL列,为HLL独有的聚合方式)、 + * BITMAP_UNION(仅用于 BITMAP 列,为 BITMAP 独有的聚合方式)、 + * 
REPLACE_IF_NOT_NULL:这个聚合类型的含义是当且仅当新导入数据是非NULL值时会发生替换行为,如果新导入的数据是NULL,那么Doris仍然会保留原值。注意:如果用在建表时REPLACE_IF_NOT_NULL列指定了NOT NULL,那么Doris仍然会将其转化NULL,不会向用户报错。用户可以借助这个类型完成部分列导入的功能。 + * 该类型只对聚合模型(key_desc的type为AGGREGATE KEY)有用,其它模型不需要指这个。 + + 是否允许为NULL: 默认不允许为 NULL。NULL 值在导入数据中用 \N 来表示 + + 注意: + BITMAP_UNION聚合类型列在导入时的原始数据类型必须是TINYINT,SMALLINT,INT。 + +2. index_definition + 语法: + `INDEX index_name (col_name[, col_name, ...]) [USING BITMAP] COMMENT 'xxxxxx'` + 说明: + index_name:索引名称 + col_name:列名 + 注意: + 当前仅支持BITMAP索引, BITMAP索引仅支持应用于单列 + +3. ENGINE 类型 + 默认为 olap。可选 mysql, broker + 1) 如果是 mysql,则需要在 properties 提供以下信息: + +``` + PROPERTIES ( + "host" = "mysql_server_host", + "port" = "mysql_server_port", + "user" = "your_user_name", + "password" = "your_password", + "database" = "database_name", + "table" = "table_name" + ) +``` + + 注意: + "table" 条目中的 "table_name" 是 mysql 中的真实表名。 + 而 CREATE TABLE 语句中的 table_name 是该 mysql 表在 Palo 中的名字,可以不同。 - 3. key_desc - 语法: - key_type(k1[,k2 ...]) - 说明: - 数据按照指定的key列进行排序,且根据不同的key_type具有不同特性。 - key_type支持一下类型: - AGGREGATE KEY:key列相同的记录,value列按照指定的聚合类型进行聚合, - 适合报表、多维分析等业务场景。 - UNIQUE KEY:key列相同的记录,value列按导入顺序进行覆盖, - 适合按key列进行增删改查的点查询业务。 - DUPLICATE KEY:key列相同的记录,同时存在于Palo中, - 适合存储明细数据或者数据无聚合特性的业务场景。 - 默认为DUPLICATE KEY,key列为列定义中前36个字节, 如果前36个字节的列数小于3,将使用前三列。 - 注意: - 除AGGREGATE KEY外,其他key_type在建表时,value列不需要指定聚合类型。 + 在 Palo 创建 mysql 表的目的是可以通过 Palo 访问 mysql 数据库。 + 而 Palo 本身并不维护、存储任何 mysql 数据。 + 1) 如果是 broker,表示表的访问需要通过指定的broker, 需要在 properties 提供以下信息: + ``` + PROPERTIES ( + "broker_name" = "broker_name", + "path" = "file_path1[,file_path2]", + "column_separator" = "value_separator" + "line_delimiter" = "value_delimiter" + ) + ``` + 另外还需要提供Broker需要的Property信息,通过BROKER PROPERTIES来传递,例如HDFS需要传入 + ``` + BROKER PROPERTIES( + "username" = "name", + "password" = "password" + ) + ``` + 这个根据不同的Broker类型,需要传入的内容也不相同 + 注意: + "path" 中如果有多个文件,用逗号[,]分割。如果文件名中包含逗号,那么使用 %2c 来替代。如果文件名中包含 %,使用 %25 代替 + 现在文件内容格式支持CSV,支持GZ,BZ2,LZ4,LZO(LZOP) 压缩格式。 + +1. key_desc + 语法: + `key_type(k1[,k2 ...])` + 说明: + 数据按照指定的key列进行排序,且根据不同的key_type具有不同特性。 + key_type支持一下类型: + AGGREGATE KEY:key列相同的记录,value列按照指定的聚合类型进行聚合, + 适合报表、多维分析等业务场景。 + UNIQUE KEY:key列相同的记录,value列按导入顺序进行覆盖, + 适合按key列进行增删改查的点查询业务。 + DUPLICATE KEY:key列相同的记录,同时存在于Palo中, + 适合存储明细数据或者数据无聚合特性的业务场景。 + 默认为DUPLICATE KEY,key列为列定义中前36个字节, 如果前36个字节的列数小于3,将使用前三列。 + 注意: + 除AGGREGATE KEY外,其他key_type在建表时,value列不需要指定聚合类型。 - 4. partition_desc - partition描述有两种使用方式 - 1) LESS THAN +2. partition_desc + partition描述有两种使用方式 + 1) LESS THAN 语法: + + ``` PARTITION BY RANGE (k1, k2, ...) ( PARTITION partition_name1 VALUES LESS THAN MAXVALUE|("value1", "value2", ...), PARTITION partition_name2 VALUES LESS THAN MAXVALUE|("value1", "value2", ...) ... ) + ``` + 说明: 使用指定的 key 列和指定的数值范围进行分区。 1) 分区名称仅支持字母开头,字母、数字和下划线组成 @@ -156,260 +183,322 @@ under the License. 3) 分区为左闭右开区间,首个分区的左边界为做最小值 4) NULL 值只会存放在包含最小值的分区中。当包含最小值的分区被删除后,NULL 值将无法导入。 5) 可以指定一列或多列作为分区列。如果分区值缺省,则会默认填充最小值。 - + 注意: 1) 分区一般用于时间维度的数据管理 2) 有数据回溯需求的,可以考虑首个分区为空分区,以便后续增加分区 - - 2)Fixed Range + + 2)Fixed Range 语法: + ``` PARTITION BY RANGE (k1, k2, k3, ...) ( PARTITION partition_name1 VALUES [("k1-lower1", "k2-lower1", "k3-lower1",...), ("k1-upper1", "k2-upper1", "k3-upper1", ...)), PARTITION partition_name2 VALUES [("k1-lower1-2", "k2-lower1-2", ...), ("k1-upper1-2", MAXVALUE, )) "k3-upper1-2", ... ) + ``` 说明: 1)Fixed Range比LESS THAN相对灵活些,左右区间完全由用户自己确定 2)其他与LESS THAN保持同步 - 5. distribution_desc +3. 
distribution_desc 1) Hash 分桶 语法: - DISTRIBUTED BY HASH (k1[,k2 ...]) [BUCKETS num] + `DISTRIBUTED BY HASH (k1[,k2 ...]) [BUCKETS num]` 说明: 使用指定的 key 列进行哈希分桶。默认分区数为10 - 建议:建议使用Hash分桶方式 + 建议:建议使用Hash分桶方式 - 6. PROPERTIES - 1) 如果 ENGINE 类型为 olap,则可以在 properties 中指定列存(目前我们仅支持列存) +4. PROPERTIES + 1) 如果 ENGINE 类型为 olap,则可以在 properties 中指定列存(目前我们仅支持列存) + ``` + PROPERTIES ( + "storage_type" = "[column]", + ) + ``` - PROPERTIES ( - "storage_type" = "[column]", - ) - - 2) 如果 ENGINE 类型为 olap + 2) 如果 ENGINE 类型为 olap 可以在 properties 设置该表数据的初始存储介质、存储到期时间和副本数。 - - PROPERTIES ( + + ``` + PROPERTIES ( "storage_medium" = "[SSD|HDD]", ["storage_cooldown_time" = "yyyy-MM-dd HH:mm:ss"], ["replication_num" = "3"] ) - - storage_medium: 用于指定该分区的初始存储介质,可选择 SSD 或 HDD。默认为 HDD。 + ``` + + storage_medium: 用于指定该分区的初始存储介质,可选择 SSD 或 HDD。默认为 HDD。 storage_cooldown_time: 当设置存储介质为 SSD 时,指定该分区在 SSD 上的存储到期时间。 默认存放 7 天。 格式为:"yyyy-MM-dd HH:mm:ss" replication_num: 指定分区的副本数。默认为 3 - - 当表为单分区表时,这些属性为表的属性。 + + 当表为单分区表时,这些属性为表的属性。 当表为两级分区时,这些属性为附属于每一个分区。 如果希望不同分区有不同属性。可以通过 ADD PARTITION 或 MODIFY PARTITION 进行操作 - 3) 如果 Engine 类型为 olap, 并且 storage_type 为 column, 可以指定某列使用 bloom filter 索引 + 3) 如果 Engine 类型为 olap, 并且 storage_type 为 column, 可以指定某列使用 bloom filter 索引 bloom filter 索引仅适用于查询条件为 in 和 equal 的情况,该列的值越分散效果越好 目前只支持以下情况的列:除了 TINYINT FLOAT DOUBLE 类型以外的 key 列及聚合方法为 REPLACE 的 value 列 - - PROPERTIES ( + +``` + PROPERTIES ( "bloom_filter_columns"="k1,k2,k3" ) - 4) 如果希望使用Colocate Join 特性,需要在 properties 中指定 +``` + + 4) 如果希望使用Colocate Join 特性,需要在 properties 中指定 - PROPERTIES ( +``` + PROPERTIES ( "colocate_with"="table1" ) - +``` + ## example - 1. 创建一个 olap 表,使用 HASH 分桶,使用列存,相同key的记录进行聚合 - CREATE TABLE example_db.table_hash - ( - k1 TINYINT, - k2 DECIMAL(10, 2) DEFAULT "10.5", - v1 CHAR(10) REPLACE, - v2 INT SUM - ) - ENGINE=olap - AGGREGATE KEY(k1, k2) - COMMENT "my first doris table" - DISTRIBUTED BY HASH(k1) BUCKETS 32 - PROPERTIES ("storage_type"="column"); - - 2. 创建一个 olap 表,使用 Hash 分桶,使用列存,相同key的记录进行覆盖, - 设置初始存储介质和冷却时间 - CREATE TABLE example_db.table_hash - ( - k1 BIGINT, - k2 LARGEINT, - v1 VARCHAR(2048) REPLACE, - v2 SMALLINT SUM DEFAULT "10" - ) - ENGINE=olap - UNIQUE KEY(k1, k2) - DISTRIBUTED BY HASH (k1, k2) BUCKETS 32 - PROPERTIES( - "storage_type"="column", - "storage_medium" = "SSD", - "storage_cooldown_time" = "2015-06-04 00:00:00" - ); - - 3. 创建一个 olap 表,使用 Range 分区,使用Hash分桶,默认使用列存, - 相同key的记录同时存在,设置初始存储介质和冷却时间 - + +1. 创建一个 olap 表,使用 HASH 分桶,使用列存,相同key的记录进行聚合 + + ``` + CREATE TABLE example_db.table_hash + ( + k1 TINYINT, + k2 DECIMAL(10, 2) DEFAULT "10.5", + v1 CHAR(10) REPLACE, + v2 INT SUM + ) + ENGINE=olap + AGGREGATE KEY(k1, k2) + COMMENT "my first doris table" + DISTRIBUTED BY HASH(k1) BUCKETS 32 + PROPERTIES ("storage_type"="column"); + ``` + +2. 创建一个 olap 表,使用 Hash 分桶,使用列存,相同key的记录进行覆盖, + 设置初始存储介质和冷却时间 + + ``` + CREATE TABLE example_db.table_hash + ( + k1 BIGINT, + k2 LARGEINT, + v1 VARCHAR(2048) REPLACE, + v2 SMALLINT SUM DEFAULT "10" + ) + ENGINE=olap + UNIQUE KEY(k1, k2) + DISTRIBUTED BY HASH (k1, k2) BUCKETS 32 + PROPERTIES( + "storage_type"="column", + "storage_medium" = "SSD", + "storage_cooldown_time" = "2015-06-04 00:00:00" + ); + ``` + +3. 
创建一个 olap 表,使用 Range 分区,使用Hash分桶,默认使用列存, + 相同key的记录同时存在,设置初始存储介质和冷却时间 + 1)LESS THAN - CREATE TABLE example_db.table_range - ( - k1 DATE, - k2 INT, - k3 SMALLINT, - v1 VARCHAR(2048), - v2 DATETIME DEFAULT "2014-02-04 15:36:00" - ) - ENGINE=olap - DUPLICATE KEY(k1, k2, k3) - PARTITION BY RANGE (k1) - ( - PARTITION p1 VALUES LESS THAN ("2014-01-01"), - PARTITION p2 VALUES LESS THAN ("2014-06-01"), - PARTITION p3 VALUES LESS THAN ("2014-12-01") - ) - DISTRIBUTED BY HASH(k2) BUCKETS 32 - PROPERTIES( - "storage_medium" = "SSD", "storage_cooldown_time" = "2015-06-04 00:00:00" - ); - - 说明: - 这个语句会将数据划分成如下3个分区: - ( { MIN }, {"2014-01-01"} ) - [ {"2014-01-01"}, {"2014-06-01"} ) - [ {"2014-06-01"}, {"2014-12-01"} ) - - 不在这些分区范围内的数据将视为非法数据被过滤 - - 2) Fixed Range - CREATE TABLE table_range - ( - k1 DATE, - k2 INT, - k3 SMALLINT, - v1 VARCHAR(2048), - v2 DATETIME DEFAULT "2014-02-04 15:36:00" - ) - ENGINE=olap - DUPLICATE KEY(k1, k2, k3) - PARTITION BY RANGE (k1, k2, k3) - ( - PARTITION p1 VALUES [("2014-01-01", "10", "200"), ("2014-01-01", "20", "300")), - PARTITION p2 VALUES [("2014-06-01", "100", "200"), ("2014-07-01", "100", "300")) - ) - DISTRIBUTED BY HASH(k2) BUCKETS 32 - PROPERTIES( - "storage_medium" = "SSD" - ); - - 4. 创建一个 mysql 表 - CREATE TABLE example_db.table_mysql - ( - k1 DATE, - k2 INT, - k3 SMALLINT, - k4 VARCHAR(2048), - k5 DATETIME - ) - ENGINE=mysql - PROPERTIES - ( - "host" = "127.0.0.1", - "port" = "8239", - "user" = "mysql_user", - "password" = "mysql_passwd", - "database" = "mysql_db_test", - "table" = "mysql_table_test" - ) - - 5. 创建一个数据文件存储在HDFS上的 broker 外部表, 数据使用 "|" 分割,"\n" 换行 - CREATE EXTERNAL TABLE example_db.table_broker ( - k1 DATE, - k2 INT, - k3 SMALLINT, - k4 VARCHAR(2048), - k5 DATETIME - ) - ENGINE=broker - PROPERTIES ( - "broker_name" = "hdfs", - "path" = "hdfs://hdfs_host:hdfs_port/data1,hdfs://hdfs_host:hdfs_port/data2,hdfs://hdfs_host:hdfs_port/data3%2c4", - "column_separator" = "|", - "line_delimiter" = "\n" - ) - BROKER PROPERTIES ( - "username" = "hdfs_user", - "password" = "hdfs_password" - ) - 6. 创建一张含有HLL列的表 - CREATE TABLE example_db.example_table - ( - k1 TINYINT, - k2 DECIMAL(10, 2) DEFAULT "10.5", - v1 HLL HLL_UNION, - v2 HLL HLL_UNION - ) - ENGINE=olap - AGGREGATE KEY(k1, k2) - DISTRIBUTED BY HASH(k1) BUCKETS 32 - PROPERTIES ("storage_type"="column"); - - 7. 创建一张含有BITMAP_UNION聚合类型的表(v1和v2列的原始数据类型必须是TINYINT,SMALLINT,INT) - CREATE TABLE example_db.example_table - ( - k1 TINYINT, - k2 DECIMAL(10, 2) DEFAULT "10.5", - v1 BITMAP BITMAP_UNION, - v2 BITMAP BITMAP_UNION - ) - ENGINE=olap - AGGREGATE KEY(k1, k2) - DISTRIBUTED BY HASH(k1) BUCKETS 32 - PROPERTIES ("storage_type"="column"); - - 8. 
创建两张支持Colocat Join的表t1 和t2 - CREATE TABLE `t1` ( - `id` int(11) COMMENT "", - `value` varchar(8) COMMENT "" - ) ENGINE=OLAP - DUPLICATE KEY(`id`) - DISTRIBUTED BY HASH(`id`) BUCKETS 10 - PROPERTIES ( - "colocate_with" = "t1" - ); - - CREATE TABLE `t2` ( - `id` int(11) COMMENT "", - `value` varchar(8) COMMENT "" - ) ENGINE=OLAP - DUPLICATE KEY(`id`) - DISTRIBUTED BY HASH(`id`) BUCKETS 10 - PROPERTIES ( - "colocate_with" = "t1" - ); + ``` + CREATE TABLE example_db.table_range + ( + k1 DATE, + k2 INT, + k3 SMALLINT, + v1 VARCHAR(2048), + v2 DATETIME DEFAULT "2014-02-04 15:36:00" + ) + ENGINE=olap + DUPLICATE KEY(k1, k2, k3) + PARTITION BY RANGE (k1) + ( + PARTITION p1 VALUES LESS THAN ("2014-01-01"), + PARTITION p2 VALUES LESS THAN ("2014-06-01"), + PARTITION p3 VALUES LESS THAN ("2014-12-01") + ) + DISTRIBUTED BY HASH(k2) BUCKETS 32 + PROPERTIES( + "storage_medium" = "SSD", "storage_cooldown_time" = "2015-06-04 00:00:00" + ); + ``` - 9. 创建一个数据文件存储在BOS上的 broker 外部表 - CREATE EXTERNAL TABLE example_db.table_broker ( - k1 DATE - ) - ENGINE=broker - PROPERTIES ( - "broker_name" = "bos", - "path" = "bos://my_bucket/input/file", - ) - BROKER PROPERTIES ( - "bos_endpoint" = "http://bj.bcebos.com", - "bos_accesskey" = "xxxxxxxxxxxxxxxxxxxxxxxxxx", - "bos_secret_accesskey"="yyyyyyyyyyyyyyyyyyyy" - ) + 说明: + 这个语句会将数据划分成如下3个分区: + + ``` + ( { MIN }, {"2014-01-01"} ) + [ {"2014-01-01"}, {"2014-06-01"} ) + [ {"2014-06-01"}, {"2014-12-01"} ) + ``` + + 不在这些分区范围内的数据将视为非法数据被过滤 + + 2) Fixed Range + + ``` + CREATE TABLE table_range + ( + k1 DATE, + k2 INT, + k3 SMALLINT, + v1 VARCHAR(2048), + v2 DATETIME DEFAULT "2014-02-04 15:36:00" + ) + ENGINE=olap + DUPLICATE KEY(k1, k2, k3) + PARTITION BY RANGE (k1, k2, k3) + ( + PARTITION p1 VALUES [("2014-01-01", "10", "200"), ("2014-01-01", "20", "300")), + PARTITION p2 VALUES [("2014-06-01", "100", "200"), ("2014-07-01", "100", "300")) + ) + DISTRIBUTED BY HASH(k2) BUCKETS 32 + PROPERTIES( + "storage_medium" = "SSD" + ); + ``` + +4. 创建一个 mysql 表 + +``` + CREATE TABLE example_db.table_mysql + ( + k1 DATE, + k2 INT, + k3 SMALLINT, + k4 VARCHAR(2048), + k5 DATETIME + ) + ENGINE=mysql + PROPERTIES + ( + "host" = "127.0.0.1", + "port" = "8239", + "user" = "mysql_user", + "password" = "mysql_passwd", + "database" = "mysql_db_test", + "table" = "mysql_table_test" + ) +``` + +5. 创建一个数据文件存储在HDFS上的 broker 外部表, 数据使用 "|" 分割,"\n" 换行 + +``` + CREATE EXTERNAL TABLE example_db.table_broker ( + k1 DATE, + k2 INT, + k3 SMALLINT, + k4 VARCHAR(2048), + k5 DATETIME + ) + ENGINE=broker + PROPERTIES ( + "broker_name" = "hdfs", + "path" = "hdfs://hdfs_host:hdfs_port/data1,hdfs://hdfs_host:hdfs_port/data2,hdfs://hdfs_host:hdfs_port/data3%2c4", + "column_separator" = "|", + "line_delimiter" = "\n" + ) + BROKER PROPERTIES ( + "username" = "hdfs_user", + "password" = "hdfs_password" + ) +``` + +6. 创建一张含有HLL列的表 + +``` + CREATE TABLE example_db.example_table + ( + k1 TINYINT, + k2 DECIMAL(10, 2) DEFAULT "10.5", + v1 HLL HLL_UNION, + v2 HLL HLL_UNION + ) + ENGINE=olap + AGGREGATE KEY(k1, k2) + DISTRIBUTED BY HASH(k1) BUCKETS 32 + PROPERTIES ("storage_type"="column"); +``` + +7. 创建一张含有BITMAP_UNION聚合类型的表(v1和v2列的原始数据类型必须是TINYINT,SMALLINT,INT) + +``` + CREATE TABLE example_db.example_table + ( + k1 TINYINT, + k2 DECIMAL(10, 2) DEFAULT "10.5", + v1 BITMAP BITMAP_UNION, + v2 BITMAP BITMAP_UNION + ) + ENGINE=olap + AGGREGATE KEY(k1, k2) + DISTRIBUTED BY HASH(k1) BUCKETS 32 + PROPERTIES ("storage_type"="column"); +``` + +8. 
创建两张支持Colocat Join的表t1 和t2 + +``` + CREATE TABLE `t1` ( + `id` int(11) COMMENT "", + `value` varchar(8) COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 10 + PROPERTIES ( + "colocate_with" = "t1" + ); + + CREATE TABLE `t2` ( + `id` int(11) COMMENT "", + `value` varchar(8) COMMENT "" + ) ENGINE=OLAP + DUPLICATE KEY(`id`) + DISTRIBUTED BY HASH(`id`) BUCKETS 10 + PROPERTIES ( + "colocate_with" = "t1" + ); +``` + +9. 创建一个数据文件存储在BOS上的 broker 外部表 + +``` + CREATE EXTERNAL TABLE example_db.table_broker ( + k1 DATE + ) + ENGINE=broker + PROPERTIES ( + "broker_name" = "bos", + "path" = "bos://my_bucket/input/file", + ) + BROKER PROPERTIES ( + "bos_endpoint" = "http://bj.bcebos.com", + "bos_accesskey" = "xxxxxxxxxxxxxxxxxxxxxxxxxx", + "bos_secret_accesskey"="yyyyyyyyyyyyyyyyyyyy" + ) +``` + +10. 创建一个带有bitmap 索引的表 + +``` + CREATE TABLE example_db.table_hash + ( + k1 TINYINT, + k2 DECIMAL(10, 2) DEFAULT "10.5", + v1 CHAR(10) REPLACE, + v2 INT SUM, + INDEX k1_idx (k1) USING BITMAP COMMENT 'xxxxxx' + ) + ENGINE=olap + AGGREGATE KEY(k1, k2) + COMMENT "my first doris table" + DISTRIBUTED BY HASH(k1) BUCKETS 32 + PROPERTIES ("storage_type"="column"); +``` ## keyword + CREATE,TABLE - diff --git a/docs/documentation/cn/sql-reference/sql-statements/Data Definition/DROP INDEX.md b/docs/documentation/cn/sql-reference/sql-statements/Data Definition/DROP INDEX.md new file mode 100644 index 00000000000000..5ed5a78b640d9f --- /dev/null +++ b/docs/documentation/cn/sql-reference/sql-statements/Data Definition/DROP INDEX.md @@ -0,0 +1,30 @@ + + +# DROP INDEX + +## description + + 该语句用于从一个表中删除指定名称的索引,目前仅支持bitmap 索引 + 语法: + DROP INDEX index_name ON [db_name.]table_name; + +## keyword + + DROP,INDEX diff --git a/docs/documentation/cn/sql-reference/sql-statements/Data Manipulation/SHOW PARTITIONS.md b/docs/documentation/cn/sql-reference/sql-statements/Data Manipulation/SHOW PARTITIONS.md index be01915cbc0384..4e2b8552ecd69f 100644 --- a/docs/documentation/cn/sql-reference/sql-statements/Data Manipulation/SHOW PARTITIONS.md +++ b/docs/documentation/cn/sql-reference/sql-statements/Data Manipulation/SHOW PARTITIONS.md @@ -21,15 +21,18 @@ under the License. ## description 该语句用于展示分区信息 语法: - SHOW PARTITIONS FROM [db_name.]table_name [PARTITION partition_name]; + SHOW PARTITIONS FROM [db_name.]table_name [WHERE] [ORDER BY] [LIMIT]; + 说明: + 支持PartitionId,PartitionName,State,Buckets,ReplicationNum,LastConsistencyCheckTime等列的过滤 ## example - 1. 展示指定 db 的下指定表的分区信息 + 1.展示指定db下指定表的所有分区信息 SHOW PARTITIONS FROM example_db.table_name; - 1. 展示指定 db 的下指定表的指定分区的信息 - SHOW PARTITIONS FROM example_db.table_name PARTITION p1; - + 2.展示指定db下指定表的指定分区的信息 + SHOW PARTITIONS FROM example_db.table_name WHERE PartitionName = "p1"; + + 3.展示指定db下指定表的最新分区的信息 + SHOW PARTITIONS FROM example_db.table_name ORDER BY PartitionId DESC LIMIT 1; ## keyword SHOW,PARTITIONS - diff --git a/docs/documentation/en/getting-started/data-model-rollup_EN.md b/docs/documentation/en/getting-started/data-model-rollup_EN.md index 8a54b4c28d8905..2ac0a8e1b89a57 100644 --- a/docs/documentation/en/getting-started/data-model-rollup_EN.md +++ b/docs/documentation/en/getting-started/data-model-rollup_EN.md @@ -17,7 +17,6 @@ specific language governing permissions and limitations under the License. --> - # Data Model, ROLLUP and Prefix Index This document describes Doris's data model, ROLLUP and prefix index concepts at the logical level to help users better use Doris to cope with different business scenarios. 
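The new DROP INDEX document added above gives only the syntax and, unlike CREATE INDEX.md, carries no example section. A usage sketch in line with the CREATE INDEX example; the identifiers `index_name` and `table1` are simply reused from that example, not prescribed by the statement itself:

```
DROP INDEX index_name ON example_db.table1;
```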
@@ -31,7 +30,7 @@ Columns can be divided into two categories: Key and Value. From a business persp Doris's data model is divided into three main categories: -*Aggregate +* Aggregate * Uniq * Duplicate @@ -45,7 +44,7 @@ We illustrate what aggregation model is and how to use it correctly with practic Assume that the business has the following data table schema: -Columns +|ColumnName|Type|AggregationType|Comment| |---|---|---|---| | userid | LARGEINT | | user id| | date | DATE | | date of data filling| @@ -62,27 +61,27 @@ If converted into a table-building statement, the following is done (omitting th ``` CREATE TABLE IF NOT EXISTS example_db.expamle_tbl ( -`user_id` LARGEINT NOT NULL COMMENT "用户id", -"Date `date not null how `index `Fufu 8;'Back -` City `VARCHAR (20) COMMENT `User City', -"Age" SMALLINT COMMENT "29992;" 25143;"24180;" 40836 ", -`sex` TINYINT COMMENT "用户性别", -"last visit date" DATETIME REPLACE DEFAULT "1970 -01 -01 00:00" COMMENT "25143;" 27425;"35775;" 3838382", -`cost` BIGINT SUM DEFAULT "0" COMMENT "用户总消费", -Best Answer: Best Answer -How about "99999" as time goes by??????????????????????????????????????????????????????????????????????????????????????????? + `user_id` LARGEINT NOT NULL COMMENT "user id", + `date` DATE NOT NULL COMMENT "data import time", + `city` VARCHAR(20) COMMENT "city", + `age` SMALLINT COMMENT "age", + `sex` TINYINT COMMENT "gender", + `last_visit_date` DATETIME REPLACE DEFAULT "1970-01-01 00:00:00" COMMENT "last visit date time", + `cost` BIGINT SUM DEFAULT "0" COMMENT "user total cost", + `max_dwell_time` INT MAX DEFAULT "0" COMMENT "user max dwell time", + `min_dwell_time` INT MIN DEFAULT "99999" COMMENT "user min dwell time", ) AGGREGATE KEY(`user_id`, `date`, `timestamp`, `city`, `age`, `sex`) -... /* 省略 Partition 和 Distribution 信息 */ +... /* ignore Partition and Distribution */ ; ``` As you can see, this is a typical fact table of user information and access behavior. In general star model, user information and access behavior are stored in dimension table and fact table respectively. Here, in order to explain Doris's data model more conveniently, we store the two parts of information in a single table. -The columns in the table are divided into Key (dimension column) and Value (indicator column) according to whether `AggregationType'is set or not. No `AggregationType', such as `user_id', `date', `age', etc., is set as ** Key **, while `AggregationType'is set as ** Value **. +The columns in the table are divided into Key (dimension column) and Value (indicator column) according to whether `AggregationType`is set or not. No `AggregationType`, such as `user_id`, `date`, `age`, etc., is set as **Key**, while AggregationType'is set as **Value**. -When we import data, the same rows and aggregates into one row for the Key column, while the Value column aggregates according to the set `AggregationType'. ` AggregationType `currently has the following four ways of aggregation: +When we import data, the same rows and aggregates into one row for the Key column, while the Value column aggregates according to the set `AggregationType`. `AggregationType`currently has the following four ways of aggregation: 1. SUM: Sum, multi-line Value accumulation. 2. REPLACE: Instead, Values in the next batch of data will replace Values in rows previously imported. @@ -130,12 +129,12 @@ As you can see, there is only one line of aggregated data left for 10,000 users. 
The first five columns remain unchanged, starting with column 6 `last_visit_date': -*` 2017-10-01 07:00 `: Because the `last_visit_date'column is aggregated by REPLACE, the `2017-10-01 07:00 ` column has been replaced by `2017-10-01 06:00'. +*`2017-10-01 07:00`: Because the `last_visit_date`column is aggregated by REPLACE, the `2017-10-01 07:00` column has been replaced by `2017-10-01 06:00'. > Note: For data in the same import batch, the order of replacement is not guaranteed for the aggregation of REPLACE. For example, in this case, it may be `2017-10-01 06:00'. For data from different imported batches, it can be guaranteed that the data from the latter batch will replace the former batch. -*` 35 `: Because the aggregation type of the `cost'column is SUM, 35 is accumulated from 20 + 15. -*` 10 `: Because the aggregation type of the `max_dwell_time'column is MAX, 10 and 2 take the maximum and get 10. -*` 2 `: Because the aggregation type of `min_dwell_time'column is MIN, 10 and 2 take the minimum value and get 2. +*`35`: Because the aggregation type of the `cost'column is SUM, 35 is accumulated from 20 + 15. +*`10`: Because the aggregation type of the`max_dwell_time'column is MAX, 10 and 2 take the maximum and get 10. +*`2`: Because the aggregation type of `min_dwell_time'column is MIN, 10 and 2 take the minimum value and get 2. After aggregation, Doris ultimately only stores aggregated data. In other words, detailed data will be lost and users can no longer query the detailed data before aggregation. @@ -143,7 +142,7 @@ After aggregation, Doris ultimately only stores aggregated data. In other words, Following example 1, we modify the table structure as follows: -Columns +|ColumnName|Type|AggregationType|Comment| |---|---|---|---| | userid | LARGEINT | | user id| | date | DATE | | date of data filling| @@ -182,7 +181,7 @@ Then when this batch of data is imported into Doris correctly, the final storage | 10004 | 2017-10-01 | 2017-10-01 12:12:48 | Shenzhen | 35 | 0 | 2017-10-01 10:00:15 | 100 | 3 | 3| | 10004 | 2017-10-03 | 2017-10-03 12:38:20 | Shenzhen | 35 | 0 | 2017-10-03 10:20:22 | 11 | 6 | 6| -We can see that the stored data, just like the imported data, does not aggregate at all. This is because, in this batch of data, because the `timestamp'column is added, the Keys of all rows are ** not exactly the same **. That is, as long as the keys of each row are not identical in the imported data, Doris can save the complete detailed data even in the aggregation model. +We can see that the stored data, just like the imported data, does not aggregate at all. This is because, in this batch of data, because the `timestamp'column is added, the Keys of all rows are **not exactly the same**. That is, as long as the keys of each row are not identical in the imported data, Doris can save the complete detailed data even in the aggregation model. ### Example 3: Importing data and aggregating existing data @@ -224,13 +223,13 @@ Data aggregation occurs in Doris in the following three stages: 2. The stage in which the underlying BE performs data Compaction. At this stage, BE aggregates data from different batches that have been imported. 3. Data query stage. In data query, the data involved in the query will be aggregated accordingly. -Data may be aggregated to varying degrees at different times. For example, when a batch of data is just imported, it may not be aggregated with the existing data. But for users, user** can only query aggregated data. 
That is, different degrees of aggregation are transparent to user queries. Users should always assume that data exists in terms of the degree of aggregation that ** ultimately completes, and ** should not assume that some aggregation has not yet occurred **. (See the section ** Limitations of the aggregation model ** for more details.) +Data may be aggregated to varying degrees at different times. For example, when a batch of data is just imported, it may not be aggregated with the existing data. But for users, user**can only query aggregated data**. That is, different degrees of aggregation are transparent to user queries. Users should always assume that data exists in terms of the degree of aggregation that **ultimately completes**, and **should not assume that some aggregation has not yet occurred**. (See the section **Limitations of the aggregation model** for more details.) ## Uniq Model In some multi-dimensional analysis scenarios, users are more concerned with how to ensure the uniqueness of Key, that is, how to obtain the Primary Key uniqueness constraint. Therefore, we introduce Uniq's data model. This model is essentially a special case of aggregation model and a simplified representation of table structure. Let's give an example. -Columns +|ColumnName|Type|IsKey|Comment| |---|---|---|---| | user_id | BIGINT | Yes | user id| | username | VARCHAR (50) | Yes | User nickname| @@ -262,7 +261,7 @@ Unique Key ("User", "User", "Name") This table structure is exactly the same as the following table structure described by the aggregation model: -Columns +|ColumnName|Type|AggregationType|Comment| |---|---|---|---| | user_id | BIGINT | | user id| | username | VARCHAR (50) | | User nickname| @@ -298,17 +297,16 @@ That is to say, Uniq model can be completely replaced by REPLACE in aggregation In some multidimensional analysis scenarios, data has neither primary keys nor aggregation requirements. Therefore, we introduce Duplicate data model to meet this kind of demand. Examples are given. -+ 124; Columname = 124; type = 124; sortkey = 124; comment = 124; +|ColumnName|Type|SortKey|Comment| |---|---|---|---| | Timstamp | DATETIME | Yes | Logging Time| | Type | INT | Yes | Log Type| -|error_code|INT|Yes|错误码| +|error_code|INT|Yes|error code| | Error_msg | VARCHAR (1024) | No | Error Details| -1.2.2.2.;2.2.2.1.;2.2.2.2.2.2.2.2.2.2. -| op_time | DATETIME | No | Processing time| +|op_id|BIGINT|No|operator id| +|op_time|DATETIME|No|operation time| The TABLE statement is as follows: - ``` CREATE TABLE IF NOT EXISTS example_db.expamle_tbl ( @@ -337,7 +335,7 @@ ROLLUP in multidimensional analysis means "scroll up", which means that data is In Doris, we make the table created by the user through the table building statement a Base table. Base table holds the basic data stored in the way specified by the user's table-building statement. -On top of the Base table, we can create any number of ROLLUP tables. These ROLLUP data are generated based on the Base table and physically ** stored independently **. +On top of the Base table, we can create any number of ROLLUP tables. These ROLLUP data are generated based on the Base table and physically **stored independently**. The basic function of ROLLUP tables is to obtain coarser aggregated data on the basis of Base tables. 
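ROLLUPs such as the user_id/cost example that follows are created on top of the Base table with ALTER TABLE ... ADD ROLLUP. A sketch only, reusing the example_db.expamle_tbl table and the user_id and cost columns from the surrounding examples; the rollup name rollup_cost_userid is chosen here purely for illustration:

```
ALTER TABLE example_db.expamle_tbl ADD ROLLUP rollup_cost_userid(user_id, cost);
```

Doris populates and maintains the ROLLUP itself, which is why a later note states that data updates for a ROLLUP are fully synchronized with the Base table and need no user intervention.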
@@ -349,9 +347,9 @@ Because Uniq is only a special case of the Aggregate model, we do not distinguis Example 1: Get the total consumption per user -Following ** Example 2 ** in the ** Aggregate Model ** section, the Base table structure is as follows: +Following **Example 2** in the **Aggregate Model** section, the Base table structure is as follows: -Columns +|ColumnName|Type|AggregationType|Comment| |---|---|---|---| | user_id | LARGEINT | | user id| | date | DATE | | date of data filling| @@ -378,7 +376,7 @@ The data stored are as follows: On this basis, we create a ROLLUP: -1240; Colonname 12412; +|ColumnName| |---| |user_id| |cost| @@ -403,7 +401,7 @@ Doris automatically hits the ROLLUP table, thus completing the aggregated query Follow example 1. Based on the Base table, we create a ROLLUP: -Columns +|ColumnName|Type|AggregationType|Comment| |---|---|---|---| | City | VARCHAR (20) | | User City| | age | SMALLINT | | User age| @@ -448,23 +446,23 @@ We use the prefix index of ** 36 bytes ** of a row of data as the prefix index o 1. The prefix index of the following table structure is user_id (8Byte) + age (8Bytes) + message (prefix 20 Bytes). -+ 124; Columname = 124; type = 124; +|ColumnName|Type| |---|---| |user_id|BIGINT| |age|INT| -Message -124max \\u dwell u team 124DATE -124m;min \\u dwell u team 124DATE +|message|VARCHAR(100)| +|max\_dwell\_time|DATETIME| +|min\_dwell\_time|DATETIME| 2. The prefix index of the following table structure is user_name (20 Bytes). Even if it does not reach 36 bytes, because it encounters VARCHAR, it truncates directly and no longer continues. -+ 124; Columname = 124; type = 124; +|ColumnName|Type| |---|---| -User name +|user_name|VARCHAR(20)| |age|INT| -Message -124max \\u dwell u team 124DATE -124m;min \\u dwell u team 124DATE +|message|VARCHAR(100)| +|max\_dwell\_time|DATETIME| +|min\_dwell\_time|DATETIME| When our query condition is the prefix of ** prefix index **, it can greatly speed up the query speed. For example, in the first example, we execute the following queries: @@ -482,23 +480,23 @@ Because column order is specified when a table is built, there is only one prefi The structure of the Base table is as follows: -+ 124; Columname = 124; type = 124; +|ColumnName|Type| |---|---| |user\_id|BIGINT| |age|INT| -Message -124max \\u dwell u team 124DATE -124m;min \\u dwell u team 124DATE +|message|VARCHAR(100)| +|max\_dwell\_time|DATETIME| +|min\_dwell\_time|DATETIME| On this basis, we can create a ROLLUP table: -+ 124; Columname = 124; type = 124; +|ColumnName|Type| |---|---| |age|INT| |user\_id|BIGINT| -Message -124max \\u dwell u team 124DATE -124m;min \\u dwell u team 124DATE +|message|VARCHAR(100)| +|max\_dwell\_time|DATETIME| +|min\_dwell\_time|DATETIME| As you can see, the columns of ROLLUP and Base tables are exactly the same, just changing the order of user_id and age. So when we do the following query: @@ -514,9 +512,9 @@ The ROLLUP table is preferred because the prefix index of ROLLUP matches better. * Data updates for ROLLUP are fully synchronized with Base representations. Users need not care about this problem. * Columns in ROLLUP are aggregated in exactly the same way as Base tables. There is no need to specify or modify ROLLUP when creating it. * A necessary (inadequate) condition for a query to hit ROLLUP is that all columns ** (including the query condition columns in select list and where) involved in the query exist in the column of the ROLLUP. Otherwise, the query can only hit the Base table. 
-* Certain types of queries (such as count (*)) cannot hit ROLLUP under any conditions. See the next section ** Limitations of the aggregation model **. +* Certain types of queries (such as count (*)) cannot hit ROLLUP under any conditions. See the next section **Limitations of the aggregation model**. * The query execution plan can be obtained by `EXPLAIN your_sql;` command, and in the execution plan, whether ROLLUP has been hit or not can be checked. -* Base tables and all created ROLLUPs can be displayed by `DESC tbl_name ALL'; `statement. +* Base tables and all created ROLLUPs can be displayed by `DESC tbl_name ALL;` statement. In this document, you can see [Query how to hit Rollup] (hit-the-rollup) @@ -528,7 +526,7 @@ In the aggregation model, what the model presents is the aggregated data. That i The hypothesis table is structured as follows: -Columns +|ColumnName|Type|AggregationType|Comment| |---|---|---|---| | userid | LARGEINT | | user id| | date | DATE | | date of data filling| @@ -602,22 +600,22 @@ Because the final aggregation result is: |10002|2017-11-21|39| |10003|2017-11-22|22| -So `select count (*) from table; `The correct result should be ** 4 **. But if we only scan the `user_id'column and add query aggregation, the final result is ** 3 ** (10001, 10002, 10003). If aggregated without queries, the result is ** 5 ** (a total of five rows in two batches). It can be seen that both results are wrong. +So `select count (*) from table;` The correct result should be **4**. But if we only scan the `user_id'column and add query aggregation, the final result is **3** (10001, 10002, 10003). If aggregated without queries, the result is **5** (a total of five rows in two batches). It can be seen that both results are wrong. -In order to get the correct result, we must read the data of `user_id'and `date', and ** together with aggregate ** when querying, to return the correct result of ** 4 **. That is to say, in the count (*) query, Doris must scan all AGGREGATE KEY columns (here are `user_id` and `date') and aggregate them to get the semantically correct results. When aggregated columns are large, count (*) queries need to scan a large amount of data. +In order to get the correct result, we must read the data of `user_id` and `date`, and **together with aggregate** when querying, to return the correct result of **4**. That is to say, in the count (*) query, Doris must scan all AGGREGATE KEY columns (here are `user_id` and `date`) and aggregate them to get the semantically correct results. When aggregated columns are large, count (*) queries need to scan a large amount of data. -Therefore, when there are frequent count (*) queries in the business, we recommend that users simulate count (*)**) by adding a column with a ** value of 1 and aggregation type of SUM. As the table structure in the previous example, we modify it as follows: +Therefore, when there are frequent count (*) queries in the business, we recommend that users simulate count (*) by adding a column with a value of 1 and aggregation type of SUM. As the table structure in the previous example, we modify it as follows: -Columns +|ColumnName|Type|AggregationType|Comment| |---|---|---|---| | user ID | BIGINT | | user id| | date | DATE | | date of data filling| | Cost | BIGINT | SUM | Total User Consumption| | count | BIGINT | SUM | for counting| -Add a count column and import the data with the column value ** equal to 1 **. 
The result of `select count (*) from table; `is equivalent to `select sum (count) from table; ` The query efficiency of the latter is much higher than that of the former. However, this method also has limitations, that is, users need to guarantee that they will not import rows with the same AGGREGATE KEY column repeatedly. Otherwise, `select sum (count) from table; `can only express the number of rows originally imported, not the semantics of `select count (*) from table; ` +Add a count column and import the data with the column value **equal to 1**. The result of `select count (*) from table;`is equivalent to `select sum (count) from table;` The query efficiency of the latter is much higher than that of the former. However, this method also has limitations, that is, users need to guarantee that they will not import rows with the same AGGREGATE KEY column repeatedly. Otherwise, `select sum (count) from table;`can only express the number of rows originally imported, not the semantics of `select count (*) from table;` -Another way is to ** change the aggregation type of the `count'column above to REPLACE, and still weigh 1 **. Then `select sum (count) from table; `and `select count (*) from table; `the results will be consistent. And in this way, there is no restriction on importing duplicate rows. +Another way is to **change the aggregation type of the count'column above to REPLACE, and still weigh 1**. Then`select sum (count) from table;` and `select count (*) from table;` the results will be consistent. And in this way, there is no restriction on importing duplicate rows. ### Duplicate Model @@ -625,7 +623,7 @@ Duplicate model has no limitation of aggregation model. Because the model does n ## Suggestions for Choosing Data Model -Because the data model was established when the table was built, and ** could not be modified **. Therefore, it is very important to select an appropriate data model **. +Because the data model was established when the table was built, and **could not be modified **. Therefore, it is very important to select an appropriate data model**. 1. Aggregate model can greatly reduce the amount of data scanned and the amount of query computation by pre-aggregation. It is very suitable for report query scenarios with fixed patterns. But this model is not very friendly for count (*) queries. At the same time, because the aggregation method on the Value column is fixed, semantic correctness should be considered in other types of aggregation queries. 2. Uniq model guarantees the uniqueness of primary key for scenarios requiring unique primary key constraints. However, the query advantage brought by pre-aggregation such as ROLLUP can not be exploited (because the essence is REPLACE, there is no such aggregation as SUM). diff --git a/docs/documentation/en/getting-started/data-partition_EN.md b/docs/documentation/en/getting-started/data-partition_EN.md index 111ad28e449772..7d809c5cbc5e77 100644 --- a/docs/documentation/en/getting-started/data-partition_EN.md +++ b/docs/documentation/en/getting-started/data-partition_EN.md @@ -17,7 +17,6 @@ specific language governing permissions and limitations under the License. --> - # Data Partition This document mainly introduces Doris's table construction and data partitioning, as well as problems and solutions that may be encountered in the construction of the table. @@ -53,31 +52,31 @@ This section introduces Doris's approach to building tables with an example. 
``` CREATE TABLE IF NOT EXISTS example_db.expamle_tbl ( -    `user_id` LARGEINT NOT NULL COMMENT "user id", -    `date` DATE NOT NULL COMMENT "Data fill in date time", -    `timestamp` DATETIME NOT NULL COMMENT "Timestamp of data being poured", -    `city` VARCHAR(20) COMMENT "The city where the user is located", -    `age` SMALLINT COMMENT "user age", -    `sex` TINYINT COMMENT "User Gender", -    `last_visit_date` DATETIME REPLACE DEFAULT "1970-01-01 00:00:00" COMMENT "User last visit time", -    `cost` BIGINT SUM DEFAULT "0" COMMENT "Total user consumption", -    `max_dwell_time` INT MAX DEFAULT "0" COMMENT "User maximum dwell time", -    `min_dwell_time` INT MIN DEFAULT "99999" COMMENT "User minimum dwell time" + `user_id` LARGEINT NOT NULL COMMENT "user id", + `date` DATE NOT NULL COMMENT "Data fill in date time", + `timestamp` DATETIME NOT NULL COMMENT "Timestamp of data being poured", + `city` VARCHAR(20) COMMENT "The city where the user is located", + `age` SMALLINT COMMENT "user age", + `sex` TINYINT COMMENT "User Gender", + `last_visit_date` DATETIME REPLACE DEFAULT "1970-01-01 00:00:00" COMMENT "User last visit time", + `cost` BIGINT SUM DEFAULT "0" COMMENT "Total user consumption", + `max_dwell_time` INT MAX DEFAULT "0" COMMENT "User maximum dwell time", + `min_dwell_time` INT MIN DEFAULT "99999" COMMENT "User minimum dwell time" ) ENGINE=olap AGGREGATE KEY(`user_id`, `date`, `timestamp`, `city`, `age`, `sex`) PARTITION BY RANGE(`date`) ( -    PARTITION `p201701` VALUES LESS THAN ("2017-02-01"), -    PARTITION `p201702` VALUES LESS THAN ("2017-03-01"), -    PARTITION `p201703` VALUES LESS THAN ("2017-04-01") + PARTITION `p201701` VALUES LESS THAN ("2017-02-01"), + PARTITION `p201702` VALUES LESS THAN ("2017-03-01"), + PARTITION `p201703` VALUES LESS THAN ("2017-04-01") ) DISTRIBUTED BY HASH(`user_id`) BUCKETS 16 PROPERTIES ( -    "replication_num" = "3", -    "storage_medium" = "SSD", -    "storage_cooldown_time" = "2018-01-01 12:00:00" + "replication_num" = "3", + "storage_medium" = "SSD", + "storage_cooldown_time" = "2018-01-01 12:00:00" ); ``` @@ -106,98 +105,93 @@ It is also possible to use only one layer of partitioning. When using a layer pa 1. Partition - * The Partition column can specify one or more columns. The partition class must be a KEY column. The use of multi-column partitions is described later in the **Multi-column partitioning** summary. -     + * The Partition column can specify one or more columns. The partition class must be a KEY column. The use of multi-column partitions is described later in the **Multi-column partitioning** summary.  * Regardless of the type of partition column, double quotes are required when writing partition values. * Partition columns are usually time columns for easy management of old and new data. * There is no theoretical limit on the number of partitions. * When you do not use Partition to build a table, the system will automatically generate a Partition with the same name as the table name. This Partition is not visible to the user and cannot be modified. * Partition supports only the upper bound by `VALUES LESS THAN (...)`, the system will use the upper bound of the previous partition as the lower bound of the partition, and generate a left closed right open interval. Passing, also supports specifying the upper and lower bounds by `VALUES [...)`, and generating a left closed right open interval. - * It is easier to understand by specifying `VALUES [...)`. 
Here is an example of the change in partition range when adding or deleting partitions using the `VALUES LESS THAN (...)` statement: -     * As the example above, when the table is built, the following 3 partitions are automatically generated: + ``` + P201701: [MIN_VALUE, 2017-02-01) + P201702: [2017-02-01, 2017-03-01) + P201703: [2017-03-01, 2017-04-01) + ``` + * When we add a partition p201705 VALUES LESS THAN ("2017-06-01"), the partition results are as follows: -            ``` -            P201701: [MIN_VALUE, 2017-02-01) -            P201702: [2017-02-01, 2017-03-01) -            P201703: [2017-03-01, 2017-04-01) -            ``` -         -        * When we add a partition p201705 VALUES LESS THAN ("2017-06-01"), the partition results are as follows: - -            ``` -            P201701: [MIN_VALUE, 2017-02-01) -            P201702: [2017-02-01, 2017-03-01) -            P201703: [2017-03-01, 2017-04-01) -            P201705: [2017-04-01, 2017-06-01) -            ``` -             -        * At this point we delete the partition p201703, the partition results are as follows: -         -            ``` -            p201701: [MIN_VALUE, 2017-02-01) -            p201702: [2017-02-01, 2017-03-01) -            p201705: [2017-04-01, 2017-06-01) -            ``` -             -            > Note that the partition range of p201702 and p201705 has not changed, and there is a hole between the two partitions: [2017-03-01, 2017-04-01). That is, if the imported data range is within this hole, it cannot be imported. -             -        * Continue to delete partition p201702, the partition results are as follows: -         -            ``` -            p201701: [MIN_VALUE, 2017-02-01) -            p201705: [2017-04-01, 2017-06-01) -            The void range becomes: [2017-02-01, 2017-04-01) -            ``` -             -        * Now add a partition p201702new VALUES LESS THAN ("2017-03-01"), the partition results are as follows: -             -            ``` -            p201701: [MIN_VALUE, 2017-02-01) -            p201702new: [2017-02-01, 2017-03-01) -            p201705: [2017-04-01, 2017-06-01) -            ``` -             -            > You can see that the hole size is reduced to: [2017-03-01, 2017-04-01) -             -        * Now delete partition p201701 and add partition p201612 VALUES LESS THAN ("2017-01-01"), the partition result is as follows: - -            ``` -            p201612: [MIN_VALUE, 2017-01-01) -            p201702new: [2017-02-01, 2017-03-01) -            p201705: [2017-04-01, 2017-06-01) ``` -             + P201701: [MIN_VALUE, 2017-02-01) + P201702: [2017-02-01, 2017-03-01) + P201703: [2017-03-01, 2017-04-01) + P201705: [2017-04-01, 2017-06-01) + ``` + + * At this point we delete the partition p201703, the partition results are as follows: + + ``` + p201701: [MIN_VALUE, 2017-02-01) + p201702: [2017-02-01, 2017-03-01) + p201705: [2017-04-01, 2017-06-01) + ``` + + > Note that the partition range of p201702 and p201705 has not changed, and there is a hole between the two partitions: [2017-03-01, 2017-04-01). That is, if the imported data range is within this hole, it cannot be imported. 
+ + * Continue to delete partition p201702, the partition results are as follows: + + ``` + p201701: [MIN_VALUE, 2017-02-01) + p201705: [2017-04-01, 2017-06-01) + The void range becomes: [2017-02-01, 2017-04-01) + ``` + + * Now add a partition p201702new VALUES LESS THAN ("2017-03-01"), the partition results are as follows: + + ``` + p201701: [MIN_VALUE, 2017-02-01) + p201702new: [2017-02-01, 2017-03-01) + p201705: [2017-04-01, 2017-06-01) + ``` + + > You can see that the hole size is reduced to: [2017-03-01, 2017-04-01) + + * Now delete partition p201701 and add partition p201612 VALUES LESS THAN ("2017-01-01"), the partition result is as follows: + + ``` + p201612: [MIN_VALUE, 2017-01-01) + p201702new: [2017-02-01, 2017-03-01) + p201705: [2017-04-01, 2017-06-01) + ``` + > A new void appeared: [2017-01-01, 2017-02-01) -         -    In summary, the deletion of a partition does not change the scope of an existing partition. There may be holes in deleting partitions. When a partition is added by the `VALUES LESS THAN` statement, the lower bound of the partition immediately follows the upper bound of the previous partition. -     -    You cannot add partitions with overlapping ranges. + + In summary, the deletion of a partition does not change the scope of an existing partition. There may be holes in deleting partitions. When a partition is added by the `VALUES LESS THAN` statement, the lower bound of the partition immediately follows the upper bound of the previous partition. + + You cannot add partitions with overlapping ranges. 2. Bucket -    * If a Partition is used, the `DISTRIBUTED ...` statement describes the division rules for the data in each partition. If you do not use Partition, it describes the rules for dividing the data of the entire table. -    * The bucket column can be multiple columns, but it must be a Key column. The bucket column can be the same or different from the Partition column. -    * The choice of bucket column is a trade-off between **query throughput** and **query concurrency**: + * If a Partition is used, the `DISTRIBUTED ...` statement describes the division rules for the data in each partition. If you do not use Partition, it describes the rules for dividing the data of the entire table. + * The bucket column can be multiple columns, but it must be a Key column. The bucket column can be the same or different from the Partition column. + * The choice of bucket column is a trade-off between **query throughput** and **query concurrency**: -        1. If you select multiple bucket columns, the data is more evenly distributed. However, if the query condition does not include the equivalent condition for all bucket columns, a query will scan all buckets. The throughput of such queries will increase, but the latency of a single query will increase. This method is suitable for large throughput and low concurrent query scenarios. -        2. If you select only one or a few bucket columns, the point query can query only one bucket. This approach is suitable for high-concurrency point query scenarios. -         -    * There is no theoretical limit on the number of buckets. + 1. If you select multiple bucket columns, the data is more evenly distributed. However, if the query condition does not include the equivalent condition for all bucket columns, a query will scan all buckets. The throughput of such queries will increase, but the latency of a single query will increase. This method is suitable for large throughput and low concurrent query scenarios. + 2. 
If you select only one or a few bucket columns, the point query can query only one bucket. This approach is suitable for high-concurrency point query scenarios. + + * There is no theoretical limit on the number of buckets. 3. Recommendations on the number and amount of data for Partitions and Buckets. -    * The total number of tablets in a table is equal to (Partition num * Bucket num). -    * The number of tablets in a table, which is slightly more than the number of disks in the entire cluster, regardless of capacity expansion. -    * The data volume of a single tablet does not theoretically have an upper and lower bound, but is recommended to be in the range of 1G - 10G. If the amount of data for a single tablet is too small, the aggregation of the data is not good and the metadata management pressure is high. If the amount of data is too large, it is not conducive to the migration, completion, and increase the cost of Schema Change or Rollup operation failure retry (the granularity of these operations failure retry is Tablet). -    * When the tablet's data volume principle and quantity principle conflict, it is recommended to prioritize the data volume principle. -    * When building a table, the number of Buckets for each partition is uniformly specified. However, when dynamically increasing partitions (`ADD PARTITION`), you can specify the number of Buckets for the new partition separately. This feature can be used to easily reduce or expand data. -    * Once the number of Buckets for a Partition is specified, it cannot be changed. Therefore, when determining the number of Buckets, you need to consider the expansion of the cluster in advance. For example, there are currently only 3 hosts, and each host has 1 disk. If the number of Buckets is only set to 3 or less, then even if you add more machines later, you can't increase the concurrency. -    * Give some examples: Suppose there are 10 BEs, one for each BE disk. If the total size of a table is 500MB, you can consider 4-8 shards. 5GB: 8-16. 50GB: 32. 500GB: Recommended partitions, each partition is about 50GB in size, with 16-32 shards per partition. 5TB: Recommended partitions, each with a size of around 50GB and 16-32 shards per partition. -     -    > Note: The amount of data in the table can be viewed by the `show data` command. The result is divided by the number of copies, which is the amount of data in the table. -     + * The total number of tablets in a table is equal to (Partition num * Bucket num). + * The number of tablets in a table, which is slightly more than the number of disks in the entire cluster, regardless of capacity expansion. + * The data volume of a single tablet does not theoretically have an upper and lower bound, but is recommended to be in the range of 1G - 10G. If the amount of data for a single tablet is too small, the aggregation of the data is not good and the metadata management pressure is high. If the amount of data is too large, it is not conducive to the migration, completion, and increase the cost of Schema Change or Rollup operation failure retry (the granularity of these operations failure retry is Tablet). + * When the tablet's data volume principle and quantity principle conflict, it is recommended to prioritize the data volume principle. + * When building a table, the number of Buckets for each partition is uniformly specified. However, when dynamically increasing partitions (`ADD PARTITION`), you can specify the number of Buckets for the new partition separately. 
This feature can be used to easily reduce or expand data. + * Once the number of Buckets for a Partition is specified, it cannot be changed. Therefore, when determining the number of Buckets, you need to consider the expansion of the cluster in advance. For example, there are currently only 3 hosts, and each host has 1 disk. If the number of Buckets is only set to 3 or less, then even if you add more machines later, you can't increase the concurrency. + * Give some examples: Suppose there are 10 BEs, one for each BE disk. If the total size of a table is 500MB, you can consider 4-8 shards. 5GB: 8-16. 50GB: 32. 500GB: Recommended partitions, each partition is about 50GB in size, with 16-32 shards per partition. 5TB: Recommended partitions, each with a size of around 50GB and 16-32 shards per partition. + + > Note: The amount of data in the table can be viewed by the `show data` command. The result is divided by the number of copies, which is the amount of data in the table. + #### Multi-column partition Doris supports specifying multiple columns as partition columns, examples are as follows: @@ -205,9 +199,9 @@ Doris supports specifying multiple columns as partition columns, examples are as ``` PARTITION BY RANGE(`date`, `id`) ( -    PARTITION `p201701_1000` VALUES LESS THAN ("2017-02-01", "1000"), -    PARTITION `p201702_2000` VALUES LESS THAN ("2017-03-01", "2000"), -    PARTITION `p201703_all` VALUES LESS THAN ("2017-04-01") + PARTITION `p201701_1000` VALUES LESS THAN ("2017-02-01", "1000"), + PARTITION `p201702_2000` VALUES LESS THAN ("2017-03-01", "2000"), + PARTITION `p201703_all` VALUES LESS THAN ("2017-04-01") ) ``` @@ -240,17 +234,17 @@ In the last PROPERTIES of the table statement, you can specify the following two Replication_num -    * The number of copies per tablet. The default is 3, it is recommended to keep the default. In the build statement, the number of Tablet copies in all Partitions is uniformly specified. When you add a new partition, you can individually specify the number of copies of the tablet in the new partition. -    * The number of copies can be modified at runtime. It is strongly recommended to keep odd numbers. -    * The maximum number of copies depends on the number of independent IPs in the cluster (note that it is not the number of BEs). The principle of replica distribution in Doris is that the copies of the same Tablet are not allowed to be distributed on the same physical machine, and the physical machine is identified as IP. Therefore, even if 3 or more BE instances are deployed on the same physical machine, if the BEs have the same IP, you can only set the number of copies to 1. -    * For some small, and infrequently updated dimension tables, consider setting more copies. In this way, when joining queries, there is a greater probability of local data join. + * The number of copies per tablet. The default is 3, it is recommended to keep the default. In the build statement, the number of Tablet copies in all Partitions is uniformly specified. When you add a new partition, you can individually specify the number of copies of the tablet in the new partition. + * The number of copies can be modified at runtime. It is strongly recommended to keep odd numbers. + * The maximum number of copies depends on the number of independent IPs in the cluster (note that it is not the number of BEs). 
The principle of replica distribution in Doris is that the copies of the same Tablet are not allowed to be distributed on the same physical machine, and the physical machine is identified as IP. Therefore, even if 3 or more BE instances are deployed on the same physical machine, if the BEs have the same IP, you can only set the number of copies to 1. + * For some small, and infrequently updated dimension tables, consider setting more copies. In this way, when joining queries, there is a greater probability of local data join. 2. storage_medium & storage\_cooldown\_time -    * The BE data storage directory can be explicitly specified as SSD or HDD (differentiated by .SSD or .HDD suffix). When you build a table, you can uniformly specify the media for all Partition initial storage. Note that the suffix is ​​to explicitly specify the disk media without checking to see if it matches the actual media type. -    * The default initial storage medium is HDD. If specified as an SSD, the data is initially stored on the SSD. -    * If storage\_cooldown\_time is not specified, the data is automatically migrated from the SSD to the HDD after 7 days by default. If storage\_cooldown\_time is specified, the data will not migrate until the storage_cooldown_time time is reached. -    * Note that this parameter is just a "best effort" setting when storage_medium is specified. Even if no SSD storage media is set in the cluster, no error is reported and it is automatically stored in the available data directory. Similarly, if the SSD media is inaccessible and out of space, the data may initially be stored directly on other available media. When the data expires and is migrated to the HDD, if the HDD media is inaccessible and there is not enough space, the migration may fail (but will continue to try). + * The BE data storage directory can be explicitly specified as SSD or HDD (differentiated by .SSD or .HDD suffix). When you build a table, you can uniformly specify the media for all Partition initial storage. Note that the suffix is ​​to explicitly specify the disk media without checking to see if it matches the actual media type. + * The default initial storage medium is HDD. If specified as an SSD, the data is initially stored on the SSD. + * If storage\_cooldown\_time is not specified, the data is automatically migrated from the SSD to the HDD after 7 days by default. If storage\_cooldown\_time is specified, the data will not migrate until the storage_cooldown_time time is reached. + * Note that this parameter is just a "best effort" setting when storage_medium is specified. Even if no SSD storage media is set in the cluster, no error is reported and it is automatically stored in the available data directory. Similarly, if the SSD media is inaccessible and out of space, the data may initially be stored directly on other available media. When the data expires and is migrated to the HDD, if the HDD media is inaccessible and there is not enough space, the migration may fail (but will continue to try). ### ENGINE @@ -258,7 +252,7 @@ In this example, the type of ENGINE is olap, the default ENGINE type. In Doris, ### Other -    `IF NOT EXISTS` indicates that if the table has not been created, it is created. Note that only the table name is judged here, and it is not determined whether the new table structure is the same as the existing table structure. 
So if there is a table with the same name but different structure, the command will also return success, but it does not mean that a new table and a new structure have been created. +`IF NOT EXISTS` indicates that if the table has not been created, it is created. Note that only the table name is judged here, and it is not determined whether the new table structure is the same as the existing table structure. So if there is a table with the same name but different structure, the command will also return success, but it does not mean that a new table and a new structure have been created. ## common problem @@ -266,27 +260,27 @@ In this example, the type of ENGINE is olap, the default ENGINE type. In Doris, 1. If a syntax error occurs in a long build statement, a syntax error may be incomplete. Here is a list of possible syntax errors for manual error correction: -    * The syntax is incorrect. Please read `HELP CREATE TABLE;` carefully to check the relevant syntax structure. -    * Reserved words. When the user-defined name encounters a reserved word, it needs to be enclosed in the backquote ``. It is recommended that all custom names be generated using this symbol. -    * Chinese characters or full-width characters. Non-utf8 encoded Chinese characters, or hidden full-width characters (spaces, punctuation, etc.) can cause syntax errors. It is recommended to check with a text editor with invisible characters. + * The syntax is incorrect. Please read `HELP CREATE TABLE;` carefully to check the relevant syntax structure. + * Reserved words. When the user-defined name encounters a reserved word, it needs to be enclosed in the backquote ``. It is recommended that all custom names be generated using this symbol. + * Chinese characters or full-width characters. Non-utf8 encoded Chinese characters, or hidden full-width characters (spaces, punctuation, etc.) can cause syntax errors. It is recommended to check with a text editor with invisible characters. 2. `Failed to create partition [xxx] . Timeout` -    Doris builds are created in order of Partition granularity. This error may be reported when a Partition creation fails. Even if you don't use Partition, you will report `Failed to create partition` when there is a problem with the built table, because as mentioned earlier, Doris will create an unchangeable default Partition for tables that do not have a Partition specified. -     -    When this error is encountered, it is usually the BE that has encountered problems creating data fragments. You can follow the steps below to troubleshoot: -     -    1. In fe.log, find the `Failed to create partition` log for the corresponding point in time. In this log, a series of numbers like `{10001-10010}` will appear. The first number of the pair is the Backend ID and the second number is the Tablet ID. As for the pair of numbers above, on the Backend with ID 10001, creating a tablet with ID 10010 failed. -    2. Go to the be.INFO log corresponding to Backend and find the log related to the tablet id in the corresponding time period. You can find the error message. -    3. Listed below are some common tablet creation failure errors, including but not limited to: -        * BE did not receive the relevant task, and the tablet id related log could not be found in be.INFO. Or the BE is created successfully, but the report fails. For the above questions, see [Deployment and Upgrade Documentation] to check the connectivity of FE and BE. -        * Pre-allocated memory failed. 
It may be that the length of a line in a row in the table exceeds 100KB. -        * `Too many open files`. The number of open file handles exceeds the Linux system limit. The handle limit of the Linux system needs to be modified. + Doris builds are created in order of Partition granularity. This error may be reported when a Partition creation fails. Even if you don't use Partition, you will report `Failed to create partition` when there is a problem with the built table, because as mentioned earlier, Doris will create an unchangeable default Partition for tables that do not have a Partition specified. + + When this error is encountered, it is usually the BE that has encountered problems creating data fragments. You can follow the steps below to troubleshoot: -    You can also extend the timeout by setting `tablet_create_timeout_second=xxx` in fe.conf. The default is 2 seconds. + 1. In fe.log, find the `Failed to create partition` log for the corresponding point in time. In this log, a series of numbers like `{10001-10010}` will appear. The first number of the pair is the Backend ID and the second number is the Tablet ID. As for the pair of numbers above, on the Backend with ID 10001, creating a tablet with ID 10010 failed. + 2. Go to the be.INFO log corresponding to Backend and find the log related to the tablet id in the corresponding time period. You can find the error message. + 3. Listed below are some common tablet creation failure errors, including but not limited to: + * BE did not receive the relevant task, and the tablet id related log could not be found in be.INFO. Or the BE is created successfully, but the report fails. For the above questions, see [Deployment and Upgrade Documentation] to check the connectivity of FE and BE. + * Pre-allocated memory failed. It may be that the length of a line in a row in the table exceeds 100KB. + * `Too many open files`. The number of open file handles exceeds the Linux system limit. The handle limit of the Linux system needs to be modified. + + You can also extend the timeout by setting `tablet_create_timeout_second=xxx` in fe.conf. The default is 2 seconds. 3. The build table command does not return results for a long time. -    Doris's table creation command is a synchronous command. The timeout of this command is currently set to be relatively simple, ie (tablet num * replication num) seconds. If you create more data fragments and have fragment creation failed, it may cause an error to be returned after waiting for a long timeout. -     -    Under normal circumstances, the statement will return in a few seconds or ten seconds. If it is more than one minute, it is recommended to cancel this operation directly and go to the FE or BE log to view the related errors. + Doris's table creation command is a synchronous command. The timeout of this command is currently set to be relatively simple, ie (tablet num * replication num) seconds. If you create more data fragments and have fragment creation failed, it may cause an error to be returned after waiting for a long timeout. + + Under normal circumstances, the statement will return in a few seconds or ten seconds. If it is more than one minute, it is recommended to cancel this operation directly and go to the FE or BE log to view the related errors. 
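For readers following the data-partition document above, here is a minimal sketch of how its partition and bucket guidance fits together in practice. The partition name, bounds and bucket count are illustrative assumptions only, and the table name reuses the example table from earlier in that document; none of this is part of the original change:

```
-- Add a new partition with explicit [lower, upper) bounds and its own bucket count
-- (as the document notes, the bucket number can be chosen per partition when it is added).
ALTER TABLE example_db.example_tbl
ADD PARTITION p201704 VALUES [("2017-04-01"), ("2017-05-01"))
DISTRIBUTED BY HASH(`user_id`) BUCKETS 32;

-- Check the table size afterwards; as described in the note above, dividing the
-- reported size by the replica count gives the actual data volume of the table.
SHOW DATA;
```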
diff --git a/docs/documentation/en/sql-reference/sql-functions/aggregate-functions/bitmap_EN.md b/docs/documentation/en/sql-reference/sql-functions/aggregate-functions/bitmap_EN.md
index 776908297fae91..858382591bec18 100644
--- a/docs/documentation/en/sql-reference/sql-functions/aggregate-functions/bitmap_EN.md
+++ b/docs/documentation/en/sql-reference/sql-functions/aggregate-functions/bitmap_EN.md
@@ -108,7 +108,9 @@ COUNT (DISTINCT expr) same filter_column The cardinality of the intersection of multiple bitmaps of the filter. bitmap_column_to_count is a column of type bitmap, filter_column is a column of varying dimensions, and filter_values is a list of dimension values
 
+`BITMAP_OR(expr,expr)`: Calculate the OR of two BITMAP columns. The return value is the serialized BITMAP value.
 
+`BITMAP_AND(expr,expr)`: Calculate the AND of two BITMAP columns. The return value is the serialized BITMAP value.
 
 note:
 
 1. The parameters of the BITMAP_UNION function currently only support:
diff --git a/docs/documentation/en/sql-reference/sql-statements/Administration/SHOW INDEX_EN.md b/docs/documentation/en/sql-reference/sql-statements/Administration/SHOW INDEX_EN.md
new file mode 100644
index 00000000000000..bae48c7f6cc4ca
--- /dev/null
+++ b/docs/documentation/en/sql-reference/sql-statements/Administration/SHOW INDEX_EN.md
@@ -0,0 +1,35 @@
+
+
+# SHOW INDEX
+
+## description
+
+    This statement is used to show all indexes (only bitmap indexes in the current version) of a table
+    Syntax:
+        SHOW INDEX[ES] FROM [db_name.]table_name;
+
+## example
+
+    1. display all indexes of the table table_name
+        SHOW INDEX FROM example_db.table_name;
+
+## keyword
+
+    SHOW,INDEX
diff --git a/docs/documentation/en/sql-reference/sql-statements/Data Definition/ALTER TABLE_EN.md b/docs/documentation/en/sql-reference/sql-statements/Data Definition/ALTER TABLE_EN.md
index 74c5708078080b..2a31cb70ef422e 100644
--- a/docs/documentation/en/sql-reference/sql-statements/Data Definition/ALTER TABLE_EN.md
+++ b/docs/documentation/en/sql-reference/sql-statements/Data Definition/ALTER TABLE_EN.md
@@ -18,270 +18,291 @@ under the License.
 -->
 # ALTER TABLE
+
 ## description
-    This statement is used to modify an existing table. If no rollup index is specified, the base operation is the default.
-    The statement is divided into three types of operations: schema change, rollup, partition
-    These three types of operations cannot appear in an ALTER TABLE statement at the same time.
-    Where schema change and rollup are asynchronous operations and are returned if the task commits successfully. You can then use the SHOW ALTER command to view the progress.
-    Partition is a synchronous operation, and a command return indicates that execution is complete.
-    grammar:
-        ALTER TABLE [database.]table
-        Alter_clause1[, alter_clause2, ...];
+    This statement is used to modify an existing table. If no rollup index is specified, the base operation is the default.
+    The statement is divided into three types of operations: schema change, rollup and partition.
+    These three types of operations cannot appear in one ALTER TABLE statement at the same time.
+    Schema change and rollup are asynchronous operations; the statement returns as soon as the task is submitted successfully. You can then use the SHOW ALTER command to view the progress.
+    Partition is a synchronous operation, and a command return indicates that execution is complete.
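Since schema change and rollup jobs are asynchronous, a brief sketch of how their progress might be checked after submission; the database, table and column names below are placeholders, not part of the original change, and the exact output columns depend on the Doris version:

```
-- Submit an asynchronous schema change on a placeholder table
-- (on an aggregate table the new value column would also need an aggregation type, as noted below).
ALTER TABLE example_db.my_table
ADD COLUMN new_col INT DEFAULT "0";

-- Poll the job: SHOW ALTER reports one row per job with its state
-- (for example PENDING / RUNNING / FINISHED / CANCELLED).
SHOW ALTER TABLE COLUMN FROM example_db;
SHOW ALTER TABLE ROLLUP FROM example_db;
```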
+ + grammar: + ALTER TABLE [database.]table + Alter_clause1[, alter_clause2, ...]; -    The alter_clause is divided into partition, rollup, schema change, and rename. + The alter_clause is divided into partition, rollup, schema change, rename and bimmap index. -    Partition supports the following modifications -    Increase the partition -        grammar: -            ADD PARTITION [IF NOT EXISTS] partition_name -            Partition_desc ["key"="value"] -            [DISTRIBUTED BY HASH (k1[,k2 ...]) [BUCKETS num]] -        note: -            1) partition_desc supports two ways of writing: -                * VALUES LESS THAN [MAXVALUE|("value1", ...)] -                * VALUES [("value1", ...), ("value1", ...)) -            1) The partition is the left closed right open interval. If the user only specifies the right boundary, the system will automatically determine the left boundary. -            2) If the bucket mode is not specified, the bucket method used by the built-in table is automatically used. -            3) If the bucket mode is specified, only the bucket number can be modified, and the bucket mode or bucket column cannot be modified. -            4) ["key"="value"] section can set some properties of the partition, see CREATE TABLE for details. + Partition supports the following modifications + Increase the partition + grammar: + ADD PARTITION [IF NOT EXISTS] partition_name + Partition_desc ["key"="value"] + [DISTRIBUTED BY HASH (k1[,k2 ...]) [BUCKETS num]] + note: + 1) partition_desc supports two ways of writing: + * VALUES LESS THAN [MAXVALUE|("value1", ...)] + * VALUES [("value1", ...), ("value1", ...)) + 1) The partition is the left closed right open interval. If the user only specifies the right boundary, the system will automatically determine the left boundary. + 2) If the bucket mode is not specified, the bucket method used by the built-in table is automatically used. + 3) If the bucket mode is specified, only the bucket number can be modified, and the bucket mode or bucket column cannot be modified. + 4) ["key"="value"] section can set some properties of the partition, see CREATE TABLE for details. -    2. Delete the partition -        grammar: -            DROP PARTITION [IF EXISTS] partition_name -        note: -            1) Use a partitioned table to keep at least one partition. -            2) Execute DROP PARTITION For a period of time, the deleted partition can be recovered by the RECOVER statement. See the RECOVER statement for details. -             -    3. Modify the partition properties -        grammar: -            MODIFY PARTITION partition_name SET ("key" = "value", ...) -        Description: -            1) The storage_medium, storage_cooldown_time, and replication_num attributes of the modified partition are currently supported. -            2) For single-partition tables, partition_name is the same as the table name. -         -    Rollup supports the following ways to create: -    1. Create a rollup index -        grammar: -            ADD ROLLUP rollup_name (column_name1, column_name2, ...) -            [FROM from_index_name] -            [PROPERTIES ("key"="value", ...)] -        note: -            1) If from_index_name is not specified, it is created by default from base index -            2) The columns in the rollup table must be existing columns in from_index -            3) In properties, you can specify the storage format. See CREATE TABLE for details. -             -    2. 
Delete the rollup index -        grammar: -            DROP ROLLUP rollup_name -            [PROPERTIES ("key"="value", ...)] -        note: -            1) Cannot delete base index -            2) Execute DROP ROLLUP For a period of time, the deleted rollup index can be restored by the RECOVER statement. See the RECOVER statement for details. -     -             -    Schema change supports the following modifications: -    1. Add a column to the specified location of the specified index -        grammar: -            ADD COLUMN column_name column_type [KEY | agg_type] [DEFAULT "default_value"] -            [AFTER column_name|FIRST] -            [TO rollup_index_name] -            [PROPERTIES ("key"="value", ...)] -        note: -            1) Aggregate model If you add a value column, you need to specify agg_type -            2) Non-aggregate models (such as DUPLICATE KEY) If you add a key column, you need to specify the KEY keyword. -            3) You cannot add a column that already exists in the base index to the rollup index -                Recreate a rollup index if needed -             -    2. Add multiple columns to the specified index -        grammar: -            ADD COLUMN (column_name1 column_type [KEY | agg_type] DEFAULT "default_value", ...) -            [TO rollup_index_name] -            [PROPERTIES ("key"="value", ...)] -        note: -            1) Aggregate model If you add a value column, you need to specify agg_type -            2) Non-aggregate model If you add a key column, you need to specify the KEY keyword. -            3) You cannot add a column that already exists in the base index to the rollup index -            (You can recreate a rollup index if needed) -     -    3. Remove a column from the specified index -        grammar: -            DROP COLUMN column_name -            [FROM rollup_index_name] -        note: -            1) Cannot delete partition column -            2) If the column is removed from the base index, it will also be deleted if the column is included in the rollup index -         -    4. Modify the column type and column position of the specified index -        grammar: -            MODIFY COLUMN column_name column_type [KEY | agg_type] [NULL | NOT NULL] [DEFAULT "default_value"] -            [AFTER column_name|FIRST] -            [FROM rollup_index_name] -            [PROPERTIES ("key"="value", ...)] -        note: -            1) Aggregate model If you modify the value column, you need to specify agg_type -            2) Non-aggregate type If you modify the key column, you need to specify the KEY keyword. -            3) Only the type of the column can be modified. The other attributes of the column remain as they are (ie other attributes need to be explicitly written in the statement according to the original attribute, see example 8) -            4) The partition column cannot be modified -            5) The following types of conversions are currently supported (accuracy loss is guaranteed by the user) -                TINYINT/SMALLINT/INT/BIGINT is converted to TINYINT/SMALLINT/INT/BIGINT/DOUBLE. -                Convert LARGEINT to DOUBLE -                VARCHAR supports modification of maximum length + 2. Delete the partition + grammar: + DROP PARTITION [IF EXISTS] partition_name + note: + 1) Use a partitioned table to keep at least one partition. + 2) Execute DROP PARTITION For a period of time, the deleted partition can be recovered by the RECOVER statement. See the RECOVER statement for details. + + 3. 
Modify the partition properties + grammar: + MODIFY PARTITION partition_name SET ("key" = "value", ...) + Description: + 1) The storage_medium, storage_cooldown_time, and replication_num attributes of the modified partition are currently supported. + 2) For single-partition tables, partition_name is the same as the table name. + + Rollup supports the following ways to create: + 1. Create a rollup index + grammar: + ADD ROLLUP rollup_name (column_name1, column_name2, ...) + [FROM from_index_name] + [PROPERTIES ("key"="value", ...)] + note: + 1) If from_index_name is not specified, it is created by default from base index + 2) The columns in the rollup table must be existing columns in from_index + 3) In properties, you can specify the storage format. See CREATE TABLE for details. + + 2. Delete the rollup index + grammar: + DROP ROLLUP rollup_name + [PROPERTIES ("key"="value", ...)] + note: + 1) Cannot delete base index + 2) Execute DROP ROLLUP For a period of time, the deleted rollup index can be restored by the RECOVER statement. See the RECOVER statement for details. + + + Schema change supports the following modifications: + 1. Add a column to the specified location of the specified index + grammar: + ADD COLUMN column_name column_type [KEY | agg_type] [DEFAULT "default_value"] + [AFTER column_name|FIRST] + [TO rollup_index_name] + [PROPERTIES ("key"="value", ...)] + note: + 1) Aggregate model If you add a value column, you need to specify agg_type + 2) Non-aggregate models (such as DUPLICATE KEY) If you add a key column, you need to specify the KEY keyword. + 3) You cannot add a column that already exists in the base index to the rollup index + Recreate a rollup index if needed + + 2. Add multiple columns to the specified index + grammar: + ADD COLUMN (column_name1 column_type [KEY | agg_type] DEFAULT "default_value", ...) + [TO rollup_index_name] + [PROPERTIES ("key"="value", ...)] + note: + 1) Aggregate model If you add a value column, you need to specify agg_type + 2) Non-aggregate model If you add a key column, you need to specify the KEY keyword. + 3) You cannot add a column that already exists in the base index to the rollup index + (You can recreate a rollup index if needed) + + 3. Remove a column from the specified index + grammar: + DROP COLUMN column_name + [FROM rollup_index_name] + note: + 1) Cannot delete partition column + 2) If the column is removed from the base index, it will also be deleted if the column is included in the rollup index + + 4. Modify the column type and column position of the specified index + grammar: + MODIFY COLUMN column_name column_type [KEY | agg_type] [NULL | NOT NULL] [DEFAULT "default_value"] + [AFTER column_name|FIRST] + [FROM rollup_index_name] + [PROPERTIES ("key"="value", ...)] + note: + 1) Aggregate model If you modify the value column, you need to specify agg_type + 2) Non-aggregate type If you modify the key column, you need to specify the KEY keyword. + 3) Only the type of the column can be modified. The other attributes of the column remain as they are (ie other attributes need to be explicitly written in the statement according to the original attribute, see example 8) + 4) The partition column cannot be modified + 5) The following types of conversions are currently supported (accuracy loss is guaranteed by the user) + TINYINT/SMALLINT/INT/BIGINT is converted to TINYINT/SMALLINT/INT/BIGINT/DOUBLE. 
+ Convert LARGEINT to DOUBLE + VARCHAR supports modification of maximum length Convert VARCHAR to TINYINT/SMALLINT/INT/BIGINT/LARGEINT/FLOAT/DOUBLE. Convert VARCHAR to DATE (currently support six formats: "%Y-%m-%d", "%y-%m-%d", "%Y%m%d", "%y%m%d", "%Y/%m/%d, "%y/%m/%d") Convert DATETIME to DATE(Only year-month-day information is retained, For example: `2019-12-09 21:47:05` <--> `2019-12-09`) Convert DATE to DATETIME(Set hour, minute, second to zero, For example: `2019-12-09` <--> `2019-12-09 00:00:00`) Convert FLOAT to DOUBLE Convert INT to DATE (If the INT data fails to convert, the original data remains the same) -            6) Does not support changing from NULL to NOT NULL -                 -    5. Reorder the columns of the specified index -        grammar: -            ORDER BY (column_name1, column_name2, ...) -            [FROM rollup_index_name] -            [PROPERTIES ("key"="value", ...)] -        note: -            1) All columns in index must be written -            2) value is listed after the key column -             -    6. Modify the properties of the table, currently supports modifying the bloom filter column and the colocate_with attribute. -        grammar: -            PROPERTIES ("key"="value") -        note: -            Can also be merged into the above schema change operation to modify, see the example below -      + 6) Does not support changing from NULL to NOT NULL + + 5. Reorder the columns of the specified index + grammar: + ORDER BY (column_name1, column_name2, ...) + [FROM rollup_index_name] + [PROPERTIES ("key"="value", ...)] + note: + 1) All columns in index must be written + 2) value is listed after the key column + + 6. Modify the properties of the table, currently supports modifying the bloom filter column and the colocate_with attribute. + grammar: + PROPERTIES ("key"="value") + note: + Can also be merged into the above schema change operation to modify, see the example below +   + + Rename supports modification of the following names: + 1. Modify the table name + grammar: + RENAME new_table_name; + + 2. Modify the rollup index name + grammar: + RENAME ROLLUP old_rollup_name new_rollup_name; + + 3. Modify the partition name + grammar: + RENAME PARTITION old_partition_name new_partition_name; + + Bitmap index supports the following modifications: + 1. create bitmap index + grammar: + ADD INDEX index_name [USING BITMAP] (column [, ...],) [COMMENT 'balabala']; + note: + 1. only supports bitmap index for current version + 2. BITMAP index only supports apply on single column + 2. drop index + grammar: + DROP INDEX index_name; -    Rename supports modification of the following names: -    1. Modify the table name -        grammar: -            RENAME new_table_name; -             -    2. Modify the rollup index name -        grammar: -            RENAME ROLLUP old_rollup_name new_rollup_name; -             -    3. Modify the partition name -        grammar: -            RENAME PARTITION old_partition_name new_partition_name; -       ## example -    [partition] -    1. Add partition, existing partition [MIN, 2013-01-01), add partition [2013-01-01, 2014-01-01), use default bucket mode -        ALTER TABLE example_db.my_table -        ADD PARTITION p1 VALUES LESS THAN ("2014-01-01"); -    2. Increase the partition and use the new number of buckets -        ALTER TABLE example_db.my_table -        ADD PARTITION p1 VALUES LESS THAN ("2015-01-01") -        DISTRIBUTED BY HASH(k1) BUCKETS 20; + [partition] + 1. 
Add partition, existing partition [MIN, 2013-01-01), add partition [2013-01-01, 2014-01-01), use default bucket mode + ALTER TABLE example_db.my_table + ADD PARTITION p1 VALUES LESS THAN ("2014-01-01"); -    3. Increase the partition and use the new number of copies -        ALTER TABLE example_db.my_table -        ADD PARTITION p1 VALUES LESS THAN ("2015-01-01") -        ("replication_num"="1"); + 2. Increase the partition and use the new number of buckets + ALTER TABLE example_db.my_table + ADD PARTITION p1 VALUES LESS THAN ("2015-01-01") + DISTRIBUTED BY HASH(k1) BUCKETS 20; -    4. Modify the number of partition copies -        ALTER TABLE example_db.my_table -        MODIFY PARTITION p1 SET("replication_num"="1"); + 3. Increase the partition and use the new number of copies + ALTER TABLE example_db.my_table + ADD PARTITION p1 VALUES LESS THAN ("2015-01-01") + ("replication_num"="1"); -    5. Delete the partition -        ALTER TABLE example_db.my_table -        DROP PARTITION p1; -         -    6. Add a partition that specifies the upper and lower bounds + 4. Modify the number of partition copies + ALTER TABLE example_db.my_table + MODIFY PARTITION p1 SET("replication_num"="1"); -        ALTER TABLE example_db.my_table -        ADD PARTITION p1 VALUES [("2014-01-01"), ("2014-02-01")); + 5. Delete the partition + ALTER TABLE example_db.my_table + DROP PARTITION p1; + + 6. Add a partition that specifies the upper and lower bounds -    [rollup] -    1. Create index: example_rollup_index, based on base index(k1,k2,k3,v1,v2). Columnar storage. -        ALTER TABLE example_db.my_table -        ADD ROLLUP example_rollup_index(k1, k3, v1, v2) -        PROPERTIES("storage_type"="column"); -         -    2. Create index: example_rollup_index2, based on example_rollup_index(k1,k3,v1,v2) -        ALTER TABLE example_db.my_table -        ADD ROLLUP example_rollup_index2 (k1, v1) -        FROM example_rollup_index; -     -    3. Delete index: example_rollup_index2 -        ALTER TABLE example_db.my_table -        DROP ROLLUP example_rollup_index2; + ALTER TABLE example_db.my_table + ADD PARTITION p1 VALUES [("2014-01-01"), ("2014-02-01")); -    [schema change] -    1. Add a key column new_col to the col1 of example_rollup_index (non-aggregate model) -        ALTER TABLE example_db.my_table -        ADD COLUMN new_col INT KEY DEFAULT "0" AFTER col1 -        TO example_rollup_index; + [rollup] + 1. Create index: example_rollup_index, based on base index(k1,k2,k3,v1,v2). Columnar storage. + ALTER TABLE example_db.my_table + ADD ROLLUP example_rollup_index(k1, k3, v1, v2) + PROPERTIES("storage_type"="column"); + + 2. Create index: example_rollup_index2, based on example_rollup_index(k1,k3,v1,v2) + ALTER TABLE example_db.my_table + ADD ROLLUP example_rollup_index2 (k1, v1) + FROM example_rollup_index; + + 3. Delete index: example_rollup_index2 + ALTER TABLE example_db.my_table + DROP ROLLUP example_rollup_index2; -    2. Add a value column new_col to the col1 of example_rollup_index (non-aggregate model) -          ALTER TABLE example_db.my_table -          ADD COLUMN new_col INT DEFAULT "0" AFTER col1 -          TO example_rollup_index; + [schema change] + 1. Add a key column new_col to the col1 of example_rollup_index (non-aggregate model) + ALTER TABLE example_db.my_table + ADD COLUMN new_col INT KEY DEFAULT "0" AFTER col1 + TO example_rollup_index; -    3. 
Add a key column new_col (aggregation model) to col1 of example_rollup_index -          ALTER TABLE example_db.my_table -          ADD COLUMN new_col INT DEFAULT "0" AFTER col1 -          TO example_rollup_index; + 2. Add a value column new_col to the col1 of example_rollup_index (non-aggregate model) +   ALTER TABLE example_db.my_table +   ADD COLUMN new_col INT DEFAULT "0" AFTER col1 +   TO example_rollup_index; -    4. Add a value column to the col1 of example_rollup_index. new_col SUM aggregation type (aggregation model) -          ALTER TABLE example_db.my_table -          ADD COLUMN new_col INT SUM DEFAULT "0" AFTER col1 -          TO example_rollup_index; -     -    5. Add multiple columns to the example_rollup_index (aggregate model) -        ALTER TABLE example_db.my_table -        ADD COLUMN (col1 INT DEFAULT "1", col2 FLOAT SUM DEFAULT "2.3") -        TO example_rollup_index; -     -    6. Remove a column from example_rollup_index -        ALTER TABLE example_db.my_table -        DROP COLUMN col2 -        FROM example_rollup_index; -         -    7. Modify the base index's col1 column to be of type BIGINT and move to the col2 column -        ALTER TABLE example_db.my_table -        MODIFY COLUMN col1 BIGINT DEFAULT "1" AFTER col2; + 3. Add a key column new_col (aggregation model) to col1 of example_rollup_index +   ALTER TABLE example_db.my_table +   ADD COLUMN new_col INT DEFAULT "0" AFTER col1 +   TO example_rollup_index; -    8. Modify the maximum length of the val1 column of the base index. The original val1 is (val1 VARCHAR(32) REPLACE DEFAULT "abc") -        ALTER TABLE example_db.my_table -        MODIFY COLUMN val1 VARCHAR(64) REPLACE DEFAULT "abc"; -     -    9. Reorder the columns in example_rollup_index (set the original column order: k1, k2, k3, v1, v2) -        ALTER TABLE example_db.my_table -        ORDER BY (k3, k1, k2, v2, v1) -        FROM example_rollup_index; -         -    10. Perform both operations simultaneously -        ALTER TABLE example_db.my_table -        ADD COLUMN v2 INT MAX DEFAULT "0" AFTER k2 TO example_rollup_index, -        ORDER BY (k3,k1,k2,v2,v1) FROM example_rollup_index; + 4. Add a value column to the col1 of example_rollup_index. new_col SUM aggregation type (aggregation model) +   ALTER TABLE example_db.my_table +   ADD COLUMN new_col INT SUM DEFAULT "0" AFTER col1 +   TO example_rollup_index; + + 5. Add multiple columns to the example_rollup_index (aggregate model) + ALTER TABLE example_db.my_table + ADD COLUMN (col1 INT DEFAULT "1", col2 FLOAT SUM DEFAULT "2.3") + TO example_rollup_index; + + 6. Remove a column from example_rollup_index + ALTER TABLE example_db.my_table + DROP COLUMN col2 + FROM example_rollup_index; + + 7. Modify the base index's col1 column to be of type BIGINT and move to the col2 column + ALTER TABLE example_db.my_table + MODIFY COLUMN col1 BIGINT DEFAULT "1" AFTER col2; -    11. Modify the bloom filter column of the table -        ALTER TABLE example_db.my_table SET ("bloom_filter_columns"="k1,k2,k3"); + 8. Modify the maximum length of the val1 column of the base index. The original val1 is (val1 VARCHAR(32) REPLACE DEFAULT "abc") + ALTER TABLE example_db.my_table + MODIFY COLUMN val1 VARCHAR(64) REPLACE DEFAULT "abc"; + + 9. Reorder the columns in example_rollup_index (set the original column order: k1, k2, k3, v1, v2) + ALTER TABLE example_db.my_table + ORDER BY (k3, k1, k2, v2, v1) + FROM example_rollup_index; + + 10. 
Perform both operations simultaneously + ALTER TABLE example_db.my_table + ADD COLUMN v2 INT MAX DEFAULT "0" AFTER k2 TO example_rollup_index, + ORDER BY (k3,k1,k2,v2,v1) FROM example_rollup_index; -        Can also be merged into the above schema change operation (note that the syntax of multiple clauses is slightly different) -        ALTER TABLE example_db.my_table -        DROP COLUMN col2 -        PROPERTIES ("bloom_filter_columns"="k1,k2,k3"); + 11. Modify the bloom filter column of the table + ALTER TABLE example_db.my_table SET ("bloom_filter_columns"="k1,k2,k3"); -    12. Modify the Colocate property of the table + Can also be merged into the above schema change operation (note that the syntax of multiple clauses is slightly different) + ALTER TABLE example_db.my_table + DROP COLUMN col2 + PROPERTIES ("bloom_filter_columns"="k1,k2,k3"); -        ALTER TABLE example_db.my_table set ("colocate_with" = "t1"); + 12. Modify the Colocate property of the table -    13. Change the bucketing mode of the table from Random Distribution to Hash Distribution + ALTER TABLE example_db.my_table set ("colocate_with" = "t1"); + + 13. Change the bucketing mode of the table from Random Distribution to Hash Distribution + + ALTER TABLE example_db.my_table set ("distribution_type" = "hash"); + + [rename] + 1. Modify the table named table1 to table2 + ALTER TABLE table1 RENAME table2; + + 2. Modify the rollup index named rollup1 in the table example_table to rollup2 + ALTER TABLE example_table RENAME ROLLUP rollup1 rollup2; + + 3. Modify the partition named p1 in the table example_table to p2 + ALTER TABLE example_table RENAME PARTITION p1 p2; + + [index] + 1. create index on table1 column siteid using bitmap + ALTER TABLE table1 ADD INDEX index_name [USING BITMAP] (siteid) COMMENT 'balabala'; + 2. drop bitmap index of table1 + ALTER TABLE table1 DROP INDEX index_name; -        ALTER TABLE example_db.my_table set ("distribution_type" = "hash"); -         -    [rename] -    1. Modify the table named table1 to table2 -        ALTER TABLE table1 RENAME table2; -         -    2. Modify the rollup index named rollup1 in the table example_table to rollup2 -        ALTER TABLE example_table RENAME ROLLUP rollup1 rollup2; -         -    3. Modify the partition named p1 in the table example_table to p2 -        ALTER TABLE example_table RENAME PARTITION p1 p2; -         ## keyword -    ALTER, TABLE, ROLLUP, COLUMN, PARTITION, RENAME + + ALTER, TABLE, ROLLUP, COLUMN, PARTITION, RENAME diff --git a/docs/documentation/en/sql-reference/sql-statements/Data Definition/CREATE INDEX_EN.md b/docs/documentation/en/sql-reference/sql-statements/Data Definition/CREATE INDEX_EN.md new file mode 100644 index 00000000000000..819a5b5dbb434e --- /dev/null +++ b/docs/documentation/en/sql-reference/sql-statements/Data Definition/CREATE INDEX_EN.md @@ -0,0 +1,38 @@ + + +# CREATE INDEX + +## description + + This statement is used to create index + grammer: + CREATE INDEX index_name ON table_name (column [, ...],) [USING BITMAP] [COMMENT'balabala']; + note: + 1. only support bitmap index in current version + 2. BITMAP index only supports apply to single column + +## example + + 1. 
create index on table1 column siteid using bitmap + CREATE INDEX index_name ON table1 (siteid) USING BITMAP COMMENT 'balabala'; + +## keyword + + CREATE,INDEX diff --git a/docs/documentation/en/sql-reference/sql-statements/Data Definition/CREATE TABLE_EN.md b/docs/documentation/en/sql-reference/sql-statements/Data Definition/CREATE TABLE_EN.md index 9879cce62c3e55..6a6e14cea0181c 100644 --- a/docs/documentation/en/sql-reference/sql-statements/Data Definition/CREATE TABLE_EN.md +++ b/docs/documentation/en/sql-reference/sql-statements/Data Definition/CREATE TABLE_EN.md @@ -18,12 +18,16 @@ under the License. --> # CREATE TABLE + ## description -### Syntax +This statement is used to create table +Syntax: +``` CREATE [EXTERNAL] TABLE [IF NOT EXISTS] [database.]table_name - (column_definition1[, column_definition2, ...]) + (column_definition1[, column_definition2, ...] + [, index_definition1[, ndex_definition12,]]) [ENGINE = [olap|mysql|broker]] [key_desc] [COMMENT "table comment"] @@ -31,17 +35,15 @@ under the License. [distribution_desc] [PROPERTIES ("key"="value", ...)] [BROKER PROPERTIES ("key"="value", ...)]; - -1. column_definition - - Syntax: +``` - col_name col_type [agg_type] [NULL | NOT NULL] [DEFAULT "default_value"] - - Explain: +1. column_definition + Syntax: + `col_name col_type [agg_type] [NULL | NOT NULL] [DEFAULT "default_value"]` + Explain: col_name: Name of column col_type: Type of column - + ``` TINYINT(1 Byte) Range: -2^7 + 1 ~ 2^7 - 1 SMALLINT(2 Bytes) @@ -77,24 +79,27 @@ under the License. BITMAP BITMAP type, No need to specify length. This type can only be queried by BITMAP_UNION、BITMAP_COUNT、TO_BITMAP functions. - - agg_type: Aggregation type. If not specified, the column is key column. Otherwise, the column is value column. - - * SUM、MAX、MIN、REPLACE - * HLL_UNION: Only for HLL type - * REPLACE_IF_NOT_NULL: The meaning of this aggregation type is that substitution will occur if and only if the newly imported data is a non-null value. If the newly imported data is null, Doris will still retain the original value. Note: if NOT NULL is specified in the REPLACE_IF_NOT_NULL column when the user creates the table, Doris will convert it to NULL and will not report an error to the user. Users can leverage this aggregate type to achieve importing some of columns. - * BITMAP_UNION: Only for BITMAP type - + ``` + agg_type: Aggregation type. If not specified, the column is key column. Otherwise, the column is value column. + * SUM、MAX、MIN、REPLACE + * HLL_UNION: Only for HLL type + * REPLACE_IF_NOT_NULL: The meaning of this aggregation type is that substitution will occur if and only if the newly imported data is a non-null value. If the newly imported data is null, Doris will still retain the original value. Note: if NOT NULL is specified in the REPLACE_IF_NOT_NULL column when the user creates the table, Doris will convert it to NULL and will not report an error to the user. Users can leverage this aggregate type to achieve importing some of columns. + * BITMAP_UNION: Only for BITMAP type Allow NULL: Default is NOT NULL. NULL value should be represented as `\N` in load source file. - Notice: The origin value of BITMAP_UNION column should be TINYINT, SMALLINT, INT. - -2. ENGINE type - +2. index_definition + Syntax: + `INDEX index_name (col_name[, col_name, ...]) [USING BITMAP] COMMENT 'xxxxxx'` + Explain: + index_name:index name + col_name:column name + Notice: + Only support BITMAP index in current version, BITMAP can only apply to single column +3. ENGINE type Default is olap. 
Options are: olap, mysql, broker 1) For mysql, properties should include: - + ``` PROPERTIES ( "host" = "mysql_server_host", @@ -105,16 +110,13 @@ under the License. "table" = "table_name" ) ``` - - Notice: + Notice: "table_name" is the real table name in MySQL database. table_name in CREATE TABLE stmt is table is Doris. They can be different or same. - MySQL table created in Doris is for accessing data in MySQL database. Doris does not maintain and store any data from MySQL table. - - 2) For broker, properties should include: + 1) For broker, properties should include: ``` PROPERTIES ( @@ -127,50 +129,34 @@ under the License. ``` BROKER PROPERTIES( - "username" = "name", + "username" = "name", "password" = "password" ) ``` - - For different broker, the broker properties are different - - Notice: - - Files name in "path" is separated by ",". If file name includes ",", use "%2c" instead. If file name includes "%", use "%25" instead. + For different broker, the broker properties are different + Notice: + Files name in "path" is separated by ",". If file name includes ",", use "%2c" instead. If file name includes "%", use "%25" instead. Support CSV and Parquet. Support GZ, BZ2, LZ4, LZO(LZOP) - -3. key_desc - - Syntax: +4. key_desc + Syntax: key_type(k1[,k2 ...]) - - Explain: - - Data is orderd by specified key columns. And has different behaviors for different key desc. - + Explain: + Data is orderd by specified key columns. And has different behaviors for different key desc. AGGREGATE KEY: - value columns will be aggregated is key columns are same. - UNIQUE KEY: - The new incoming rows will replace the old rows if key columns are same. - DUPLICATE KEY: - All incoming rows will be saved. - the default key_type is DUPLICATE KEY, and key columns are first 36 bytes of the columns in define order. - If the number of columns in the first 36 is less than 3, the first 3 columns will be used. - - NOTICE: + the default key_type is DUPLICATE KEY, and key columns are first 36 bytes of the columns in define order. + If the number of columns in the first 36 is less than 3, the first 3 columns will be used. + NOTICE: Except for AGGREGATE KEY, no need to specify aggregation type for value columns. - -4. partition_desc +5. partition_desc Partition has two ways to use: 1) LESS THAN - - Syntex: + Syntax: ``` PARTITION BY RANGE (k1, k2, ...) @@ -181,42 +167,35 @@ under the License. ) ``` - Explain: - + Explain: 1) Partition name only support [A-z0-9_] 2) Partition key column's type should be: TINYINT, SMALLINT, INT, BIGINT, LARGEINT, DATE, DATETIME - 3) The range is [closed, open). And the lower bound of first partition is MIN VALUE of specifed column type. + 3) The range is [closed, open). And the lower bound of first partition is MIN VALUE of specifed column type. 4) NULL values should be save in partition which includes MIN VALUE. 5) Support multi partition columns, the the default partition value is MIN VALUE. - 2)Fixed Range - Syntex: - + Syntax: ``` PARTITION BY RANGE (k1, k2, k3, ...) ( - PARTITION partition_name1 VALUES [("k1-lower1", "k2-lower1", "k3-lower1",...), ("k1-upper1", "k2-upper1", "k3-upper1", ...)), - PARTITION partition_name2 VALUES [("k1-lower1-2", "k2-lower1-2", ...), ("k1-upper1-2", MAXVALUE, )) + PARTITION partition_name1 VALUES [("k1-lower1", "k2-lower1", "k3-lower1",...), ("k1-upper1", "k2-upper1", "k3-upper1", ...)), + PARTITION partition_name2 VALUES [("k1-lower1-2", "k2-lower1-2", ...), ("k1-upper1-2", MAXVALUE, )) "k3-upper1-2", ... 
) ``` - Explain: - 1)The Fixed Range is more flexible than the LESS THAN, and the left and right intervals are completely determined by the user. - 2)Others are consistent with LESS THAN. - -5. distribution_desc + 1)The Fixed Range is more flexible than the LESS THAN, and the left and right intervals are completely determined by the user. + 2)Others are consistent with LESS THAN. +6. distribution_desc 1) Hash - Syntax: - DISTRIBUTED BY HASH (k1[,k2 ...]) [BUCKETS num] - Explain: + Syntax: + `DISTRIBUTED BY HASH (k1[,k2 ...]) [BUCKETS num]` + Explain: The default buckets is 10. +7. PROPERTIES + 1) If ENGINE type is olap. User can specify storage medium, cooldown time and replication number: -6. PROPERTIES - - 1) If ENGINE type is olap. User can specify storage medium, cooldown time and replication number: - ``` PROPERTIES ( "storage_medium" = "[SSD|HDD]", @@ -224,249 +203,264 @@ under the License. ["replication_num" = "3"] ) ``` - + storage_medium: SSD or HDD - storage_cooldown_time: If storage_medium is SSD, data will be automatically moved to HDD when timeout. + storage_cooldown_time: If storage_medium is SSD, data will be automatically moved to HDD when timeout. Default is 7 days. Format: "yyyy-MM-dd HH:mm:ss" replication_num: Replication number of a partition. Default is 3. - - If table is not range partitions. This property takes on Table level. Or it will takes on Partition level. - - User can specify different properties for different partition by `ADD PARTITION` or `MODIFY PARTITION` statements. - - 3) If Engine type is olap, user can set bloom filter index for column. + If table is not range partitions. This property takes on Table level. Or it will takes on Partition level. + User can specify different properties for different partition by `ADD PARTITION` or `MODIFY PARTITION` statements. + 2) If Engine type is olap, user can set bloom filter index for column. + Bloom filter index will be used when query contains `IN` or `EQUAL`. + Bloom filter index support key columns with type except TINYINT FLOAT DOUBLE, also support value with REPLACE aggregation type. - Bloom filter index will be used when query contains `IN` or `EQUAL`. - Bloom filter index support key columns with type except TINYINT FLOAT DOUBLE, also support value with REPLACE aggregation type. - ``` PROPERTIES ( "bloom_filter_columns"="k1,k2,k3" ) ``` - 4) For Colocation Join: + 3) For Colocation Join: ``` PROPERTIES ( "colocate_with"="table1" ) ``` - -## example - 1. Create an olap table, distributed by hash, with aggregation type. - - ``` - CREATE TABLE example_db.table_hash - ( - k1 TINYINT, - k2 DECIMAL(10, 2) DEFAULT "10.5", - v1 CHAR(10) REPLACE, - v2 INT SUM - ) - ENGINE=olap - AGGREGATE KEY(k1, k2) - COMMENT "my first doris table" - DISTRIBUTED BY HASH(k1) BUCKETS 32 - PROPERTIES ("storage_type"="column"); - ``` - - 2. Create an olap table, distributed by hash, with aggregation type. Also set storage medium and cooldown time. - - ``` - CREATE TABLE example_db.table_hash - ( - k1 BIGINT, - k2 LARGEINT, - v1 VARCHAR(2048) REPLACE, - v2 SMALLINT SUM DEFAULT "10" - ) - ENGINE=olap - UNIQUE KEY(k1, k2) - DISTRIBUTED BY HASH (k1, k2) BUCKETS 32 - PROPERTIES( - "storage_type"="column", - "storage_medium" = "SSD", - "storage_cooldown_time" = "2015-06-04 00:00:00" - ); - - 3. Create an olap table, with range partitioned, distributed by hash. 
- 1) LESS THAN - ``` - CREATE TABLE example_db.table_range - ( - k1 DATE, - k2 INT, - k3 SMALLINT, - v1 VARCHAR(2048), - v2 DATETIME DEFAULT "2014-02-04 15:36:00" - ) - ENGINE=olap - DUPLICATE KEY(k1, k2, k3) - PARTITION BY RANGE (k1) - ( - PARTITION p1 VALUES LESS THAN ("2014-01-01"), - PARTITION p2 VALUES LESS THAN ("2014-06-01"), - PARTITION p3 VALUES LESS THAN ("2014-12-01") - ) - DISTRIBUTED BY HASH(k2) BUCKETS 32 - PROPERTIES( - "storage_medium" = "SSD", "storage_cooldown_time" = "2015-06-04 00:00:00" - ); - ``` - - Explain: - This statement will create 3 partitions: - - ``` - ( { MIN }, {"2014-01-01"} ) - [ {"2014-01-01"}, {"2014-06-01"} ) - [ {"2014-06-01"}, {"2014-12-01"} ) - ``` - - Data outside these ranges will not be loaded. - - 2) Fixed Range - CREATE TABLE table_range - ( - k1 DATE, - k2 INT, - k3 SMALLINT, - v1 VARCHAR(2048), - v2 DATETIME DEFAULT "2014-02-04 15:36:00" - ) - ENGINE=olap - DUPLICATE KEY(k1, k2, k3) - PARTITION BY RANGE (k1, k2, k3) - ( - PARTITION p1 VALUES [("2014-01-01", "10", "200"), ("2014-01-01", "20", "300")), - PARTITION p2 VALUES [("2014-06-01", "100", "200"), ("2014-07-01", "100", "300")) - ) - DISTRIBUTED BY HASH(k2) BUCKETS 32 - PROPERTIES( - "storage_medium" = "SSD" - ); - - 4. Create a mysql table - - ``` - CREATE TABLE example_db.table_mysql - ( - k1 DATE, - k2 INT, - k3 SMALLINT, - k4 VARCHAR(2048), - k5 DATETIME - ) - ENGINE=mysql - PROPERTIES - ( - "host" = "127.0.0.1", - "port" = "8239", - "user" = "mysql_user", - "password" = "mysql_passwd", - "database" = "mysql_db_test", - "table" = "mysql_table_test" - ); - ``` - - 5. Create a broker table, with file on HDFS, line delimit by "|", column separated by "\n" - - ``` - CREATE EXTERNAL TABLE example_db.table_broker ( - k1 DATE, - k2 INT, - k3 SMALLINT, - k4 VARCHAR(2048), - k5 DATETIME - ) - ENGINE=broker - PROPERTIES ( - "broker_name" = "hdfs", - "path" = "hdfs://hdfs_host:hdfs_port/data1,hdfs://hdfs_host:hdfs_port/data2,hdfs://hdfs_host:hdfs_port/data3%2c4", - "column_separator" = "|", - "line_delimiter" = "\n" - ) - BROKER PROPERTIES ( - "username" = "hdfs_user", - "password" = "hdfs_password" - ); - ``` - 6. Create table will HLL column - - ``` - CREATE TABLE example_db.example_table - ( - k1 TINYINT, - k2 DECIMAL(10, 2) DEFAULT "10.5", - v1 HLL HLL_UNION, - v2 HLL HLL_UNION - ) - ENGINE=olap - AGGREGATE KEY(k1, k2) - DISTRIBUTED BY HASH(k1) BUCKETS 32; - ``` - - 7. Create a table will BITMAP_UNION column - - ``` - CREATE TABLE example_db.example_table - ( - k1 TINYINT, - k2 DECIMAL(10, 2) DEFAULT "10.5", - v1 BITMAP BITMAP_UNION, - v2 BITMAP BITMAP_UNION - ) - ENGINE=olap - AGGREGATE KEY(k1, k2) - DISTRIBUTED BY HASH(k1) BUCKETS 32; - ``` - - 8. Create 2 colocate join table. - - ``` - CREATE TABLE `t1` ( - `id` int(11) COMMENT "", - `value` varchar(8) COMMENT "" - ) ENGINE=OLAP - DUPLICATE KEY(`id`) - DISTRIBUTED BY HASH(`id`) BUCKETS 10 - PROPERTIES ( - "colocate_with" = "group1" - ); - - CREATE TABLE `t2` ( - `id` int(11) COMMENT "", - `value` varchar(8) COMMENT "" - ) ENGINE=OLAP - DUPLICATE KEY(`id`) - DISTRIBUTED BY HASH(`id`) BUCKETS 10 - PROPERTIES ( - "colocate_with" = "group1" - ); - ``` +## example - 9. Create a broker table, with file on BOS. +1. Create an olap table, distributed by hash, with aggregation type. 
+ + ``` + CREATE TABLE example_db.table_hash + ( + k1 TINYINT, + k2 DECIMAL(10, 2) DEFAULT "10.5", + v1 CHAR(10) REPLACE, + v2 INT SUM + ) + ENGINE=olap + AGGREGATE KEY(k1, k2) + COMMENT "my first doris table" + DISTRIBUTED BY HASH(k1) BUCKETS 32 + PROPERTIES ("storage_type"="column"); + ``` + +2. Create an olap table, distributed by hash, with aggregation type. Also set storage mediumand cooldown time. + + ``` + CREATE TABLE example_db.table_hash + ( + k1 BIGINT, + k2 LARGEINT, + v1 VARCHAR(2048) REPLACE, + v2 SMALLINT SUM DEFAULT "10" + ) + ENGINE=olap + UNIQUE KEY(k1, k2) + DISTRIBUTED BY HASH (k1, k2) BUCKETS 32 + PROPERTIES( + "storage_type"="column", + "storage_medium" = "SSD", + "storage_cooldown_time" = "2015-06-04 00:00:00" + ); + +3. Create an olap table, with range partitioned, distributed by hash. + +1) LESS THAN + + ``` + CREATE TABLE example_db.table_range + ( + k1 DATE, + k2 INT, + k3 SMALLINT, + v1 VARCHAR(2048), + v2 DATETIME DEFAULT "2014-02-04 15:36:00" + ) + ENGINE=olap + DUPLICATE KEY(k1, k2, k3) + PARTITION BY RANGE (k1) + ( + PARTITION p1 VALUES LESS THAN ("2014-01-01"), + PARTITION p2 VALUES LESS THAN ("2014-06-01"), + PARTITION p3 VALUES LESS THAN ("2014-12-01") + ) + DISTRIBUTED BY HASH(k2) BUCKETS 32 + PROPERTIES( + "storage_medium" = "SSD", "storage_cooldown_time" = "2015-06-04 00:00:00" + ); + ``` - ``` - CREATE EXTERNAL TABLE example_db.table_broker ( - k1 DATE - ) - ENGINE=broker - PROPERTIES ( - "broker_name" = "bos", - "path" = "bos://my_bucket/input/file", - ) - BROKER PROPERTIES ( - "bos_endpoint" = "http://bj.bcebos.com", - "bos_accesskey" = "xxxxxxxxxxxxxxxxxxxxxxxxxx", - "bos_secret_accesskey"="yyyyyyyyyyyyyyyyyyyy" - ); - ``` + Explain: + This statement will create 3 partitions: + + ``` + ( { MIN }, {"2014-01-01"} ) + [ {"2014-01-01"}, {"2014-06-01"} ) + [ {"2014-06-01"}, {"2014-12-01"} ) + ``` + + Data outside these ranges will not be loaded. + +2) Fixed Range + CREATE TABLE table_range + ( + k1 DATE, + k2 INT, + k3 SMALLINT, + v1 VARCHAR(2048), + v2 DATETIME DEFAULT "2014-02-04 15:36:00" + ) + ENGINE=olap + DUPLICATE KEY(k1, k2, k3) + PARTITION BY RANGE (k1, k2, k3) + ( + PARTITION p1 VALUES [("2014-01-01", "10", "200"), ("2014-01-01", "20", "300")), + PARTITION p2 VALUES [("2014-06-01", "100", "200"), ("2014-07-01", "100", "300")) + ) + DISTRIBUTED BY HASH(k2) BUCKETS 32 + PROPERTIES( + "storage_medium" = "SSD" + ); + +4. Create a mysql table + + ``` + CREATE TABLE example_db.table_mysql + ( + k1 DATE, + k2 INT, + k3 SMALLINT, + k4 VARCHAR(2048), + k5 DATETIME + ) + ENGINE=mysql + PROPERTIES + ( + "host" = "127.0.0.1", + "port" = "8239", + "user" = "mysql_user", + "password" = "mysql_passwd", + "database" = "mysql_db_test", + "table" = "mysql_table_test" + ); + ``` + +5. Create a broker table, with file on HDFS, line delimit by "|", column separated by "\n" + + ``` + CREATE EXTERNAL TABLE example_db.table_broker ( + k1 DATE, + k2 INT, + k3 SMALLINT, + k4 VARCHAR(2048), + k5 DATETIME + ) + ENGINE=broker + PROPERTIES ( + "broker_name" = "hdfs", + "path" = "hdfs://hdfs_host:hdfs_port/data1,hdfs://hdfs_host:hdfs_port/data2,hdfs://hdfs_host:hdfs_port/data3%2c4", + "column_separator" = "|", + "line_delimiter" = "\n" + ) + BROKER PROPERTIES ( + "username" = "hdfs_user", + "password" = "hdfs_password" + ); + ``` + +6. 
Create a table with HLL columns
+
+    ```
+    CREATE TABLE example_db.example_table
+    (
+    k1 TINYINT,
+    k2 DECIMAL(10, 2) DEFAULT "10.5",
+    v1 HLL HLL_UNION,
+    v2 HLL HLL_UNION
+    )
+    ENGINE=olap
+    AGGREGATE KEY(k1, k2)
+    DISTRIBUTED BY HASH(k1) BUCKETS 32;
+    ```
+
+7. Create a table with BITMAP_UNION columns
+
+    ```
+    CREATE TABLE example_db.example_table
+    (
+    k1 TINYINT,
+    k2 DECIMAL(10, 2) DEFAULT "10.5",
+    v1 BITMAP BITMAP_UNION,
+    v2 BITMAP BITMAP_UNION
+    )
+    ENGINE=olap
+    AGGREGATE KEY(k1, k2)
+    DISTRIBUTED BY HASH(k1) BUCKETS 32;
+    ```
+
+8. Create two colocate join tables.
+
+    ```
+    CREATE TABLE `t1` (
+    `id` int(11) COMMENT "",
+    `value` varchar(8) COMMENT ""
+    ) ENGINE=OLAP
+    DUPLICATE KEY(`id`)
+    DISTRIBUTED BY HASH(`id`) BUCKETS 10
+    PROPERTIES (
+    "colocate_with" = "group1"
+    );
+    CREATE TABLE `t2` (
+    `id` int(11) COMMENT "",
+    `value` varchar(8) COMMENT ""
+    ) ENGINE=OLAP
+    DUPLICATE KEY(`id`)
+    DISTRIBUTED BY HASH(`id`) BUCKETS 10
+    PROPERTIES (
+    "colocate_with" = "group1"
+    );
+    ```
+
+9. Create a broker table, with file on BOS.
+
+    ```
+    CREATE EXTERNAL TABLE example_db.table_broker (
+    k1 DATE
+    )
+    ENGINE=broker
+    PROPERTIES (
+    "broker_name" = "bos",
+    "path" = "bos://my_bucket/input/file"
+    )
+    BROKER PROPERTIES (
+    "bos_endpoint" = "http://bj.bcebos.com",
+    "bos_accesskey" = "xxxxxxxxxxxxxxxxxxxxxxxxxx",
+    "bos_secret_accesskey"="yyyyyyyyyyyyyyyyyyyy"
+    );
+    ```
+
+10. Create a table with a bitmap index
+
+    ```
+    CREATE TABLE example_db.table_hash
+    (
+    k1 TINYINT,
+    k2 DECIMAL(10, 2) DEFAULT "10.5",
+    v1 CHAR(10) REPLACE,
+    v2 INT SUM,
+    INDEX k1_idx (k1) USING BITMAP COMMENT 'xxxxxx'
+    )
+    ENGINE=olap
+    AGGREGATE KEY(k1, k2)
+    COMMENT "my first doris table"
+    DISTRIBUTED BY HASH(k1) BUCKETS 32
+    PROPERTIES ("storage_type"="column");
+    ```

 ## keyword
     CREATE,TABLE
-
diff --git a/docs/documentation/en/sql-reference/sql-statements/Data Definition/DROP INDEX_EN.md b/docs/documentation/en/sql-reference/sql-statements/Data Definition/DROP INDEX_EN.md
new file mode 100644
index 00000000000000..b13d79e8772e2b
--- /dev/null
+++ b/docs/documentation/en/sql-reference/sql-statements/Data Definition/DROP INDEX_EN.md
@@ -0,0 +1,30 @@
+
+
+# DROP INDEX
+
+## description
+
+    This statement is used to drop an index from a table.
+    Syntax:
+        DROP INDEX index_name ON [db_name.]table_name;
+
+## keyword
+
+    DROP,INDEX
diff --git a/docs/documentation/en/sql-reference/sql-statements/Data Manipulation/SHOW PARTITIONS_EN.md b/docs/documentation/en/sql-reference/sql-statements/Data Manipulation/SHOW PARTITIONS_EN.md
index 660787a1c973b1..dde28ea3507689 100644
--- a/docs/documentation/en/sql-reference/sql-statements/Data Manipulation/SHOW PARTITIONS_EN.md
+++ b/docs/documentation/en/sql-reference/sql-statements/Data Manipulation/SHOW PARTITIONS_EN.md
@@ -21,14 +21,20 @@ under the License.
 ## Description
 This statement is used to display partition information
 Grammar:
-SHOW PARTITIONS FROM [db_name.]table_name [PARTITION partition_name];
+SHOW PARTITIONS FROM [db_name.]table_name [WHERE] [ORDER BY] [LIMIT];
+Explain:
+Filtering is supported on the following columns: PartitionId, PartitionName, State, Buckets, ReplicationNum,
+LastConsistencyCheckTime
 ## example
 1. Display partition information for the specified table below the specified DB
 SHOW PARTITIONS FROM example_db.table_name;
-1. Display information about the specified partition of the specified table below the specified DB
-SHOW PARTITIONS FROM example_db.table_name PARTITION p1;
+2.
Display information about the specified partition of the specified table below the specified DB +SHOW PARTITIONS FROM example_db.table_name WHERE PartitionName = "p1"; + +3. Display information about the newest partition of the specified table below the specified DB +SHOW PARTITIONS FROM example_db.table_name ORDER BY PartitionId DESC LIMIT 1; ## keyword SHOW,PARTITIONS diff --git a/fe/pom.xml b/fe/pom.xml index e4c47a7b34450a..54e1658c05e7f9 100644 --- a/fe/pom.xml +++ b/fe/pom.xml @@ -531,6 +531,14 @@ under the License. oshi-core 4.0.0 + + + + org.jboss.xnio + xnio-nio + 3.6.5.Final + + diff --git a/fe/src/main/cup/sql_parser.cup b/fe/src/main/cup/sql_parser.cup index f330cb7598e054..9593dc9dd00f3f 100644 --- a/fe/src/main/cup/sql_parser.cup +++ b/fe/src/main/cup/sql_parser.cup @@ -196,7 +196,7 @@ terminal String KW_ADD, KW_ADMIN, KW_AFTER, KW_AGGREGATE, KW_ALL, KW_ALTER, KW_A KW_COLLATE, KW_COLLATION, KW_COLUMN, KW_COLUMNS, KW_COMMENT, KW_COMMIT, KW_COMMITTED, KW_CONFIG, KW_CONNECTION, KW_CONNECTION_ID, KW_CONSISTENT, KW_COUNT, KW_CREATE, KW_CROSS, KW_CURRENT, KW_CURRENT_USER, KW_DATA, KW_DATABASE, KW_DATABASES, KW_DATE, KW_DATETIME, KW_TIME, KW_DECIMAL, KW_DECOMMISSION, KW_DEFAULT, KW_DESC, KW_DESCRIBE, - KW_DELETE, KW_DISTINCT, KW_DISTINCTPC, KW_DISTINCTPCSA, KW_DISTRIBUTED, KW_DISTRIBUTION, KW_BUCKETS, KW_DIV, KW_DOUBLE, KW_DROP, KW_DROPP, KW_DUPLICATE, + KW_DELETE, KW_DISTINCT, KW_DISTINCTPC, KW_DISTINCTPCSA, KW_DISTRIBUTED, KW_DISTRIBUTION, KW_DYNAMIC, KW_BUCKETS, KW_DIV, KW_DOUBLE, KW_DROP, KW_DROPP, KW_DUPLICATE, KW_ELSE, KW_END, KW_ENGINE, KW_ENGINES, KW_ENTER, KW_ERRORS, KW_EVENTS, KW_EXISTS, KW_EXPORT, KW_EXTERNAL, KW_EXTRACT, KW_FALSE, KW_FOLLOWER, KW_FOLLOWING, KW_FREE, KW_FROM, KW_FILE, KW_FIRST, KW_FLOAT, KW_FOR, KW_FORMAT, KW_FRONTEND, KW_FRONTENDS, KW_FULL, KW_FUNCTION, KW_GLOBAL, KW_GRANT, KW_GRANTS, KW_GROUP, @@ -358,7 +358,9 @@ nonterminal List option_value_list, option_value_list_continued, start_o nonterminal Map key_value_map, opt_key_value_map, opt_properties, opt_ext_properties; nonterminal ColumnDef column_definition; +nonterminal IndexDef index_definition; nonterminal ArrayList column_definition_list; +nonterminal ArrayList index_definition_list; nonterminal AggregateType opt_agg_type; nonterminal PartitionDesc opt_partition; nonterminal DistributionDesc opt_distribution; @@ -417,6 +419,7 @@ nonterminal String opt_db, opt_partition_name, procedure_or_function, opt_commen nonterminal ColumnDef.DefaultValue opt_default_value; nonterminal Boolean opt_if_exists, opt_if_not_exists; nonterminal Boolean opt_external; +nonterminal IndexDef.IndexType opt_index_type; nonterminal ShowAlterStmt.AlterType opt_alter_type; @@ -780,6 +783,14 @@ alter_table_clause ::= {: RESULT = new ColumnRenameClause(colName, newColName); :} + | KW_ADD index_definition:indexDef + {: + RESULT = new CreateIndexClause(null, indexDef, true); + :} + | KW_DROP KW_INDEX ident:indexName + {: + RESULT = new DropIndexClause(indexName, null, true); + :} ; alter_system_clause ::= @@ -896,7 +907,20 @@ create_stmt ::= opt_properties:tblProperties opt_ext_properties:extProperties {: - RESULT = new CreateTableStmt(ifNotExists, isExternal, name, columns, engineName, keys, partition, distribution, tblProperties, extProperties, tableComment); + RESULT = new CreateTableStmt(ifNotExists, isExternal, name, columns, engineName, keys, partition, + distribution, tblProperties, extProperties, tableComment); + :} + | KW_CREATE opt_external:isExternal KW_TABLE opt_if_not_exists:ifNotExists table_name:name + LPAREN 
column_definition_list:columns COMMA index_definition_list:indexes RPAREN opt_engine:engineName + opt_keys:keys + opt_comment:tableComment + opt_partition:partition + opt_distribution:distribution + opt_properties:tblProperties + opt_ext_properties:extProperties + {: + RESULT = new CreateTableStmt(ifNotExists, isExternal, name, columns, indexes, engineName, keys, partition, + distribution, tblProperties, extProperties, tableComment); :} /* User */ | KW_CREATE KW_USER opt_if_not_exists:ifNotExists grant_user:user opt_user_role:userRole @@ -931,6 +955,10 @@ create_stmt ::= {: RESULT = new CreateMaterializedViewStmt(mvName, selectStmt, properties); :} + | KW_CREATE KW_INDEX ident:indexName KW_ON table_name:tableName LPAREN ident_list:cols RPAREN opt_index_type:indexType opt_comment:comment + {: + RESULT = new AlterTableStmt(tableName, Lists.newArrayList(new CreateIndexClause(tableName, new IndexDef(indexName, cols, indexType, comment), false))); + :} ; opt_aggregate ::= @@ -1416,6 +1444,10 @@ drop_stmt ::= {: RESULT = new DropFileStmt(fileName, dbName, properties); :} + | KW_DROP KW_INDEX ident:indexName KW_ON table_name:tableName + {: + RESULT = new AlterTableStmt(tableName, Lists.newArrayList(new DropIndexClause(indexName, tableName, false))); + :} ; // Recover statement @@ -1678,6 +1710,19 @@ column_definition_list ::= :} ; +index_definition_list ::= + index_definition:index + {: + RESULT = Lists.newArrayList(); + RESULT.add(index); + :} + | index_definition_list:list COMMA index_definition:index + {: + list.add(index); + RESULT = list; + :} + ; + opt_default_value ::= /* Empty */ {: @@ -1711,6 +1756,13 @@ column_definition ::= :} ; +index_definition ::= + KW_INDEX ident:indexName LPAREN ident_list:cols RPAREN opt_index_type:indexType opt_comment:comment + {: + RESULT = new IndexDef(indexName, cols, indexType, comment); + :} + ; + opt_is_allow_null ::= {: RESULT = true; @@ -1736,6 +1788,16 @@ opt_comment ::= :} ; +opt_index_type ::= + {: + RESULT = null; + :} + | KW_USING KW_BITMAP + {: + RESULT = IndexDef.IndexType.BITMAP; + :} + ; + opt_if_exists ::= {: RESULT = false; @@ -1877,6 +1939,11 @@ show_param ::= {: RESULT = new ShowDbStmt(parser.wild, parser.where); :} + /* Dynamic Partition */ + | KW_DYNAMIC KW_PARTITION KW_TABLES opt_db:db + {: + RESULT = new ShowDynamicPartitionStmt(db); + :} /* Columns */ | opt_full KW_COLUMNS from_or_in table_name:table opt_db:db opt_wild_where {: @@ -1956,9 +2023,9 @@ show_param ::= {: RESULT = new ShowDataStmt(dbTblName.getDb(), dbTblName.getTbl()); :} - | KW_PARTITIONS KW_FROM table_name:tblName opt_partition_name:partitionName + | KW_PARTITIONS KW_FROM table_name:tblName opt_wild_where order_by_clause:orderByClause limit_clause: limitClause {: - RESULT = new ShowPartitionsStmt(tblName, partitionName); + RESULT = new ShowPartitionsStmt(tblName, parser.where, orderByClause, limitClause); :} | KW_TABLET INTEGER_LITERAL:tabletId {: @@ -2028,6 +2095,14 @@ show_param ::= {: RESULT = new ShowSmallFilesStmt(dbName); :} + | KW_INDEX KW_FROM table_name:dbTblName opt_db:dbName + {: + RESULT = new ShowIndexStmt(dbName, dbTblName); + :} + | KW_INDEXES KW_FROM table_name:tableName opt_db:dbName + {: + RESULT = new ShowIndexStmt(dbName, tableName); + :} ; keys_or_index ::= @@ -4271,6 +4346,8 @@ keyword ::= {: RESULT = id; :} | KW_STOP:id {: RESULT = id; :} + | KW_DYNAMIC:id + {: RESULT = id; :} ; // Identifier that contain keyword diff --git a/fe/src/main/java/org/apache/doris/PaloFe.java b/fe/src/main/java/org/apache/doris/PaloFe.java index 
d4ad8f56c7d48b..81217f9c12f772 100644 --- a/fe/src/main/java/org/apache/doris/PaloFe.java +++ b/fe/src/main/java/org/apache/doris/PaloFe.java @@ -91,7 +91,7 @@ public static void main(String[] args) { // 1. QeService for MySQL Server // 2. FeServer for Thrift Server // 3. HttpServer for HTTP Server - QeService qeService = new QeService(Config.query_port, ExecuteEnv.getInstance().getScheduler()); + QeService qeService = new QeService(Config.query_port, Config.mysql_service_nio_enabled, ExecuteEnv.getInstance().getScheduler()); FeServer feServer = new FeServer(Config.rpc_port); HttpServer httpServer = new HttpServer(Config.http_port); httpServer.setup(); diff --git a/fe/src/main/java/org/apache/doris/alter/Alter.java b/fe/src/main/java/org/apache/doris/alter/Alter.java index 9089e52a0fc85f..27671060c49acb 100644 --- a/fe/src/main/java/org/apache/doris/alter/Alter.java +++ b/fe/src/main/java/org/apache/doris/alter/Alter.java @@ -27,10 +27,13 @@ import org.apache.doris.analysis.AlterTableStmt; import org.apache.doris.analysis.AlterViewStmt; import org.apache.doris.analysis.ColumnRenameClause; +import org.apache.doris.analysis.CreateIndexClause; import org.apache.doris.analysis.CreateMaterializedViewStmt; import org.apache.doris.analysis.DropColumnClause; +import org.apache.doris.analysis.DropIndexClause; import org.apache.doris.analysis.DropPartitionClause; import org.apache.doris.analysis.DropRollupClause; +import org.apache.doris.analysis.IndexDef; import org.apache.doris.analysis.ModifyColumnClause; import org.apache.doris.analysis.ModifyPartitionClause; import org.apache.doris.analysis.ModifyTablePropertiesClause; @@ -42,6 +45,7 @@ import org.apache.doris.catalog.Catalog; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Database; +import org.apache.doris.catalog.Index; import org.apache.doris.catalog.OlapTable; import org.apache.doris.catalog.OlapTable.OlapTableState; import org.apache.doris.catalog.Table; @@ -52,6 +56,7 @@ import org.apache.doris.common.ErrorCode; import org.apache.doris.common.ErrorReport; import org.apache.doris.common.UserException; +import org.apache.doris.common.util.DynamicPartitionUtil; import org.apache.doris.persist.AlterViewInfo; import org.apache.doris.qe.ConnectContext; @@ -62,6 +67,8 @@ import java.util.Arrays; import java.util.List; +import java.util.Set; +import java.util.TreeSet; public class Alter { private static final Logger LOG = LogManager.getLogger(Alter.class); @@ -174,9 +181,50 @@ public void processAlterTable(AlterTableStmt stmt) throws UserException { || alterClause instanceof AddColumnsClause || alterClause instanceof DropColumnClause || alterClause instanceof ModifyColumnClause - || alterClause instanceof ReorderColumnsClause) + || alterClause instanceof ReorderColumnsClause + || alterClause instanceof CreateIndexClause + || alterClause instanceof DropIndexClause) && !hasAddMaterializedView && !hasDropRollup && !hasPartition && !hasRename) { hasSchemaChange = true; + if (alterClause instanceof CreateIndexClause) { + Table table = db.getTable(dbTableName.getTbl()); + if (!(table instanceof OlapTable)) { + throw new AnalysisException("create index only support in olap table at current version."); + } + List indexes = ((OlapTable) table).getIndexes(); + IndexDef indexDef = ((CreateIndexClause) alterClause).getIndexDef(); + Set newColset = new TreeSet<>(String.CASE_INSENSITIVE_ORDER); + newColset.addAll(indexDef.getColumns()); + for (Index idx : indexes) { + if (idx.getIndexName().equalsIgnoreCase(indexDef.getIndexName())) { 
+ throw new AnalysisException("index `" + indexDef.getIndexName() + "` already exist."); + } + Set idxSet = new TreeSet<>(String.CASE_INSENSITIVE_ORDER); + idxSet.addAll(idx.getColumns()); + if (newColset.equals(idxSet)) { + throw new AnalysisException("index for columns (" + String + .join(",", indexDef.getColumns()) + " ) already exist."); + } + } + + } else if (alterClause instanceof DropIndexClause) { + Table table = db.getTable(dbTableName.getTbl()); + if (!(table instanceof OlapTable)) { + throw new AnalysisException("drop index only support in olap table at current version."); + } + String indexName = ((DropIndexClause) alterClause).getIndexName(); + List indexes = ((OlapTable) table).getIndexes(); + Index found = null; + for (Index idx : indexes) { + if (idx.getIndexName().equalsIgnoreCase(indexName)) { + found = idx; + break; + } + } + if (found == null) { + throw new AnalysisException("index " + indexName + " does not exist"); + } + } } else if ((alterClause instanceof AddRollupClause) && !hasSchemaChange && !hasAddMaterializedView && !hasDropRollup && !hasPartition && !hasRename && !hasModifyProp) { @@ -251,8 +299,10 @@ public void processAlterTable(AlterTableStmt stmt) throws UserException { Preconditions.checkState(alterClauses.size() == 1); AlterClause alterClause = alterClauses.get(0); if (alterClause instanceof DropPartitionClause) { + DynamicPartitionUtil.checkAlterAllowed(olapTable); Catalog.getInstance().dropPartition(db, olapTable, ((DropPartitionClause) alterClause)); } else if (alterClause instanceof ModifyPartitionClause) { + DynamicPartitionUtil.checkAlterAllowed(olapTable); Catalog.getInstance().modifyPartition(db, olapTable, ((ModifyPartitionClause) alterClause)); } else { hasAddPartition = true; @@ -269,6 +319,7 @@ public void processAlterTable(AlterTableStmt stmt) throws UserException { Preconditions.checkState(alterClauses.size() == 1); AlterClause alterClause = alterClauses.get(0); if (alterClause instanceof AddPartitionClause) { + DynamicPartitionUtil.checkAlterAllowed((OlapTable) db.getTable(tableName)); Catalog.getInstance().addPartition(db, tableName, (AddPartitionClause) alterClause); } else { Preconditions.checkState(false); diff --git a/fe/src/main/java/org/apache/doris/alter/RollupJobV2.java b/fe/src/main/java/org/apache/doris/alter/RollupJobV2.java index d43c177a481459..2b4cda75ca6887 100644 --- a/fe/src/main/java/org/apache/doris/alter/RollupJobV2.java +++ b/fe/src/main/java/org/apache/doris/alter/RollupJobV2.java @@ -194,7 +194,8 @@ protected void runPendingJob() throws AlterCancelException { rollupShortKeyColumnCount, rollupSchemaHash, Partition.PARTITION_INIT_VERSION, Partition.PARTITION_INIT_VERSION_HASH, rollupKeysType, TStorageType.COLUMN, storageMedium, - rollupSchema, tbl.getCopiedBfColumns(), tbl.getBfFpp(), countDownLatch); + rollupSchema, tbl.getCopiedBfColumns(), tbl.getBfFpp(), countDownLatch, + tbl.getCopiedIndexes()); createReplicaTask.setBaseTablet(tabletIdMap.get(rollupTabletId), baseSchemaHash); if (this.storageFormat != null) { createReplicaTask.setStorageFormat(this.storageFormat); diff --git a/fe/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java b/fe/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java index 9dbeadafa2f077..92edff6421ca02 100644 --- a/fe/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java +++ b/fe/src/main/java/org/apache/doris/alter/SchemaChangeHandler.java @@ -24,7 +24,9 @@ import org.apache.doris.analysis.CancelAlterTableStmt; import org.apache.doris.analysis.CancelStmt; import 
org.apache.doris.analysis.ColumnPosition; +import org.apache.doris.analysis.CreateIndexClause; import org.apache.doris.analysis.DropColumnClause; +import org.apache.doris.analysis.DropIndexClause; import org.apache.doris.analysis.ModifyColumnClause; import org.apache.doris.analysis.ModifyTablePropertiesClause; import org.apache.doris.analysis.ReorderColumnsClause; @@ -35,6 +37,7 @@ import org.apache.doris.catalog.DistributionInfo; import org.apache.doris.catalog.DistributionInfo.DistributionInfoType; import org.apache.doris.catalog.HashDistributionInfo; +import org.apache.doris.catalog.Index; import org.apache.doris.catalog.KeysType; import org.apache.doris.catalog.MaterializedIndex; import org.apache.doris.catalog.MaterializedIndex.IndexExtState; @@ -57,6 +60,7 @@ import org.apache.doris.common.ErrorReport; import org.apache.doris.common.FeConstants; import org.apache.doris.common.UserException; +import org.apache.doris.common.util.DynamicPartitionUtil; import org.apache.doris.common.util.ListComparator; import org.apache.doris.common.util.PropertyAnalyzer; import org.apache.doris.common.util.Util; @@ -80,6 +84,7 @@ import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.Iterator; import java.util.LinkedList; import java.util.List; @@ -738,11 +743,11 @@ private void checkAssignedTargetIndexName(String baseIndexName, String targetInd } private void createJob(long dbId, OlapTable olapTable, Map> indexSchemaMap, - Map propertyMap) throws UserException { + Map propertyMap, List indexes) throws UserException { if (olapTable.getState() == OlapTableState.ROLLUP) { throw new DdlException("Table[" + olapTable.getName() + "]'s is doing ROLLUP job"); } - + if (this.hasUnfinishedAlterJob(olapTable.getId())) { throw new DdlException("Table[" + olapTable.getName() + "]'s is doing ALTER job"); } @@ -779,6 +784,14 @@ private void createJob(long dbId, OlapTable olapTable, Map newSet = new HashSet<>(indexes); + Set oriSet = new HashSet<>(olapTable.getIndexes()); + if (!newSet.equals(oriSet)) { + hasIndexChange = true; + } + // property 2. bloom filter // eg. 
"bloom_filter_columns" = "k1,k2", "bloom_filter_fpp" = "0.05" Set bfColumns = null; @@ -848,6 +861,7 @@ private void createJob(long dbId, OlapTable olapTable, Map alterClauses, String clusterName, Database for (Map.Entry> entry : olapTable.getIndexIdToSchema().entrySet()) { indexSchemaMap.put(entry.getKey(), new LinkedList(entry.getValue())); } - + List newIndexes = olapTable.getCopiedIndexes(); Map propertyMap = new HashMap(); for (AlterClause alterClause : alterClauses) { // get properties @@ -1326,6 +1342,9 @@ public void process(List alterClauses, String clusterName, Database */ sendClearAlterTask(db, olapTable); return; + } else if (DynamicPartitionUtil.checkDynamicPartitionPropertiesExist(properties)) { + Catalog.getCurrentCatalog().modifyTableDynamicPartition(db, olapTable, properties); + return; } } @@ -1347,12 +1366,16 @@ public void process(List alterClauses, String clusterName, Database } else if (alterClause instanceof ModifyTablePropertiesClause) { // modify table properties // do nothing, properties are already in propertyMap + } else if (alterClause instanceof CreateIndexClause) { + processAddIndex((CreateIndexClause) alterClause, newIndexes); + } else if (alterClause instanceof DropIndexClause) { + processDropIndex((DropIndexClause) alterClause, newIndexes); } else { Preconditions.checkState(false); } } // end for alter clauses - createJob(db.getId(), olapTable, indexSchemaMap, propertyMap); + createJob(db.getId(), olapTable, indexSchemaMap, propertyMap, newIndexes); } private void sendClearAlterTask(Database db, OlapTable olapTable) { @@ -1438,4 +1461,21 @@ public void cancel(CancelStmt stmt) throws DdlException { jobDone(schemaChangeJob); } } + + private void processAddIndex(CreateIndexClause alterClause, List indexes) { + if (alterClause.getIndex() != null) { + indexes.add(alterClause.getIndex()); + } + } + + private void processDropIndex(DropIndexClause alterClause, List indexes) { + Iterator itr = indexes.iterator(); + while (itr.hasNext()) { + Index idx = itr.next(); + if (idx.getIndexName().equalsIgnoreCase(alterClause.getIndexName())) { + itr.remove(); + break; + } + } + } } diff --git a/fe/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java b/fe/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java index 29557caa984584..ab4f1ea81ad61f 100644 --- a/fe/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java +++ b/fe/src/main/java/org/apache/doris/alter/SchemaChangeJobV2.java @@ -20,6 +20,7 @@ import org.apache.doris.catalog.Catalog; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Database; +import org.apache.doris.catalog.Index; import org.apache.doris.catalog.MaterializedIndex; import org.apache.doris.catalog.MaterializedIndex.IndexState; import org.apache.doris.catalog.OlapTable; @@ -32,6 +33,7 @@ import org.apache.doris.catalog.TabletMeta; import org.apache.doris.common.Config; import org.apache.doris.common.FeConstants; +import org.apache.doris.common.FeMetaVersion; import org.apache.doris.common.MarkedCountDownLatch; import org.apache.doris.common.Pair; import org.apache.doris.common.io.Text; @@ -42,10 +44,10 @@ import org.apache.doris.task.AgentTaskQueue; import org.apache.doris.task.AlterReplicaTask; import org.apache.doris.task.CreateReplicaTask; +import org.apache.doris.thrift.TStorageFormat; import org.apache.doris.thrift.TStorageMedium; import org.apache.doris.thrift.TStorageType; import org.apache.doris.thrift.TTaskType; -import org.apache.doris.thrift.TStorageFormat; import com.google.common.base.Joiner; import 
com.google.common.base.Preconditions; @@ -56,12 +58,14 @@ import com.google.common.collect.Table; import com.google.common.collect.Table.Cell; +import org.apache.commons.collections.CollectionUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.DataInput; import java.io.DataOutput; import java.io.IOException; +import java.util.ArrayList; import java.util.List; import java.util.Map; import java.util.Map.Entry; @@ -96,6 +100,10 @@ public class SchemaChangeJobV2 extends AlterJobV2 { private Set bfColumns = null; private double bfFpp = 0; + // alter index info + private boolean indexChange = false; + private List indexes = null; + // The schema change job will wait all transactions before this txn id finished, then send the schema change tasks. protected long watershedTxnId = -1; @@ -142,6 +150,11 @@ public void setBloomFilterInfo(boolean hasBfChange, Set bfColumns, doubl this.bfFpp = bfFpp; } + public void setAlterIndexInfo(boolean indexChange, List indexes) { + this.indexChange = indexChange; + this.indexes = indexes; + } + public void setStorageFormat(TStorageFormat storageFormat) { this.storageFormat = storageFormat; } @@ -207,12 +220,12 @@ protected void runPendingJob() throws AlterCancelException { shadowShortKeyColumnCount, shadowSchemaHash, Partition.PARTITION_INIT_VERSION, Partition.PARTITION_INIT_VERSION_HASH, tbl.getKeysType(), TStorageType.COLUMN, storageMedium, - shadowSchema, bfColumns, bfFpp, countDownLatch); + shadowSchema, bfColumns, bfFpp, countDownLatch, indexes); createReplicaTask.setBaseTablet(partitionIndexTabletMap.get(partitionId, shadowIdxId).get(shadowTabletId), originSchemaHash); if (this.storageFormat != null) { createReplicaTask.setStorageFormat(this.storageFormat); } - + batchTask.addTask(createReplicaTask); } // end for rollupReplicas } // end for rollupTablets @@ -533,6 +546,10 @@ private void onFinished(OlapTable tbl) { if (hasBfChange) { tbl.setBloomFilterInfo(bfColumns, bfFpp); } + // update index + if (indexChange) { + tbl.setIndexes(indexes); + } tbl.setState(OlapTableState.NORMAL); } @@ -838,6 +855,18 @@ public void write(DataOutput out) throws IOException { } out.writeLong(watershedTxnId); + + // index + out.writeBoolean(indexChange); + if (CollectionUtils.isNotEmpty(indexes)) { + out.writeBoolean(true); + out.writeInt(indexes.size()); + for (Index index : indexes) { + index.write(out); + } + } else { + out.writeBoolean(false); + } } public void readFields(DataInput in) throws IOException { @@ -899,5 +928,21 @@ public void readFields(DataInput in) throws IOException { } watershedTxnId = in.readLong(); + + // index + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_70) { + indexChange = in.readBoolean(); + if (indexChange) { + if (in.readBoolean()) { + int indexCount = in.readInt(); + this.indexes = new ArrayList<>(); + for (int i = 0; i < indexCount; ++i) { + this.indexes.add(Index.read(in)); + } + } else { + this.indexes = null; + } + } + } } } diff --git a/fe/src/main/java/org/apache/doris/analysis/AlterTableStmt.java b/fe/src/main/java/org/apache/doris/analysis/AlterTableStmt.java index f58dd8c5a52283..3169575c552cf4 100644 --- a/fe/src/main/java/org/apache/doris/analysis/AlterTableStmt.java +++ b/fe/src/main/java/org/apache/doris/analysis/AlterTableStmt.java @@ -18,7 +18,6 @@ package org.apache.doris.analysis; import org.apache.doris.catalog.Catalog; -import org.apache.doris.common.AnalysisException; import org.apache.doris.common.ErrorCode; import 
org.apache.doris.common.ErrorReport; import org.apache.doris.common.UserException; @@ -51,29 +50,29 @@ public List getOps() { @Override - public void analyze(Analyzer analyzer) throws AnalysisException, UserException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); if (tbl == null) { ErrorReport.reportAnalysisException(ErrorCode.ERR_NO_TABLES_USED); } tbl.analyze(analyzer); + if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), tbl.getDb(), tbl.getTbl(), + PrivPredicate.ALTER)) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_TABLEACCESS_DENIED_ERROR, "ALTER TABLE", + ConnectContext.get().getQualifiedUser(), + ConnectContext.get().getRemoteIP(), + tbl.getTbl()); + } if (ops == null || ops.isEmpty()) { ErrorReport.reportAnalysisException(ErrorCode.ERR_NO_ALTER_OPERATION); } for (AlterClause op : ops) { op.analyze(analyzer); } - - if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), tbl.getDb(), tbl.getTbl(), - PrivPredicate.ALTER)) { - ErrorReport.reportAnalysisException(ErrorCode.ERR_TABLEACCESS_DENIED_ERROR, "ALTER TABLE", - ConnectContext.get().getQualifiedUser(), - ConnectContext.get().getRemoteIP(), - tbl.getTbl()); - } } @Override + public String toSql() { StringBuilder sb = new StringBuilder(); sb.append("ALTER TABLE ").append(tbl.toSql()).append(" "); diff --git a/fe/src/main/java/org/apache/doris/analysis/CreateIndexClause.java b/fe/src/main/java/org/apache/doris/analysis/CreateIndexClause.java new file mode 100644 index 00000000000000..97187f4c79bf7c --- /dev/null +++ b/fe/src/main/java/org/apache/doris/analysis/CreateIndexClause.java @@ -0,0 +1,84 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
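+
+// CreateIndexClause backs both index-creation syntaxes: it is built with alter = true for
+// `ALTER TABLE ... ADD INDEX ...`, and with alter = false when the parser rewrites a standalone
+// `CREATE INDEX ... ON ...` statement into an AlterTableStmt. analyze() validates the IndexDef and
+// materializes the catalog-level Index that the schema change job later attaches to the table.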
+ +package org.apache.doris.analysis; + +import org.apache.doris.catalog.Index; +import org.apache.doris.common.AnalysisException; + +import com.google.common.collect.Maps; + +import java.util.Map; + +public class CreateIndexClause extends AlterTableClause { + // in which table the index on, only used when alter = false + private TableName tableName; + // index definition class + private IndexDef indexDef; + // when alter = true, clause like: alter table add index xxxx + // when alter = false, clause like: create index xx on table xxxx + private boolean alter; + // index internal class + private Index index; + + public CreateIndexClause(TableName tableName, IndexDef indexDef, boolean alter) { + this.tableName = tableName; + this.indexDef = indexDef; + this.alter = alter; + this.needTableStable = true; + } + + @Override + public Map getProperties() { + return Maps.newHashMap(); + } + + public Index getIndex() { + return index; + } + + public IndexDef getIndexDef() { + return indexDef; + } + + public boolean isAlter() { + return alter; + } + + public TableName getTableName() { + return tableName; + } + + @Override + public void analyze(Analyzer analyzer) throws AnalysisException { + if (indexDef == null) { + throw new AnalysisException("index definition expected."); + } + indexDef.analyze(); + this.index = new Index(indexDef.getIndexName(), indexDef.getColumns(), indexDef.getIndexType(), + indexDef.getComment()); + } + + @Override + public String toSql() { + if (alter) { + return indexDef.toSql(); + } else { + return "CREATE " + indexDef.toSql(tableName.toSql()); + } + } +} diff --git a/fe/src/main/java/org/apache/doris/analysis/CreateTableStmt.java b/fe/src/main/java/org/apache/doris/analysis/CreateTableStmt.java index 92b0e467681925..00c4590f3523b0 100644 --- a/fe/src/main/java/org/apache/doris/analysis/CreateTableStmt.java +++ b/fe/src/main/java/org/apache/doris/analysis/CreateTableStmt.java @@ -22,8 +22,10 @@ import org.apache.doris.catalog.AggregateType; import org.apache.doris.catalog.Catalog; import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.Index; import org.apache.doris.catalog.KeysType; import org.apache.doris.catalog.PartitionType; +import org.apache.doris.catalog.PrimitiveType; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.Config; import org.apache.doris.common.ErrorCode; @@ -42,14 +44,18 @@ import com.google.common.collect.Lists; import com.google.common.collect.Sets; +import org.apache.commons.collections.CollectionUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; import java.io.DataInput; import java.io.IOException; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Set; +import java.util.TreeSet; +import java.util.stream.Collectors; public class CreateTableStmt extends DdlStmt { private static final Logger LOG = LogManager.getLogger(CreateTableStmt.class); @@ -60,6 +66,7 @@ public class CreateTableStmt extends DdlStmt { private boolean isExternal; private TableName tableName; private List columnDefs; + private List indexDefs; private KeysDesc keysDesc; private PartitionDesc partitionDesc; private DistributionDesc distributionDesc; @@ -73,6 +80,8 @@ public class CreateTableStmt extends DdlStmt { // set in analyze private List columns = Lists.newArrayList(); + private List indexes = Lists.newArrayList(); + static { engineNames = Sets.newHashSet(); engineNames.add("olap"); @@ -95,7 +104,23 @@ public CreateTableStmt(boolean ifNotExists, boolean 
isExternal, TableName tableName, List columnDefinitions, - String engineName, + String engineName, + KeysDesc keysDesc, + PartitionDesc partitionDesc, + DistributionDesc distributionDesc, + Map properties, + Map extProperties, + String comment) { + this(ifNotExists, isExternal, tableName, columnDefinitions, null, engineName, keysDesc, partitionDesc, + distributionDesc, properties, extProperties, comment); + } + + public CreateTableStmt(boolean ifNotExists, + boolean isExternal, + TableName tableName, + List columnDefinitions, + List indexDefs, + String engineName, KeysDesc keysDesc, PartitionDesc partitionDesc, DistributionDesc distributionDesc, @@ -108,6 +133,7 @@ public CreateTableStmt(boolean ifNotExists, } else { this.columnDefs = columnDefinitions; } + this.indexDefs = indexDefs; if (Strings.isNullOrEmpty(engineName)) { this.engineName = DEFAULT_ENGINE_NAME; } else { @@ -192,14 +218,18 @@ public String getComment() { return comment; } + public List getIndexes() { + return indexes; + } + @Override - public void analyze(Analyzer analyzer) throws AnalysisException, UserException { + public void analyze(Analyzer analyzer) throws UserException { super.analyze(analyzer); tableName.analyze(analyzer); FeNameFormat.checkTableName(tableName.getTbl()); if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), tableName.getDb(), - tableName.getTbl(), PrivPredicate.CREATE)) { + tableName.getTbl(), PrivPredicate.CREATE)) { ErrorReport.reportAnalysisException(ErrorCode.ERR_SPECIFIC_ACCESS_DENIED_ERROR, "CREATE"); } @@ -362,6 +392,66 @@ public void analyze(Analyzer analyzer) throws AnalysisException, UserException { } columns.add(col); } + + if (CollectionUtils.isNotEmpty(indexDefs)) { + Set distinct = new TreeSet<>(String.CASE_INSENSITIVE_ORDER); + Set> distinctCol = new HashSet<>(); + + for (IndexDef indexDef : indexDefs) { + indexDef.analyze(); + if (!engineName.equalsIgnoreCase("olap")) { + throw new AnalysisException("index only support in olap engine at current version."); + } + for (String indexColName : indexDef.getColumns()) { + indexColName = indexColName.trim(); + boolean found = false; + for (Column column : columns) { + if (column.getName().equalsIgnoreCase(indexColName)) { + indexColName = column.getName(); + PrimitiveType colType = column.getDataType(); + + // key columns and none/replace aggregate non-key columns support + if (indexDef.getIndexType() == IndexDef.IndexType.BITMAP) { + if (!(colType == PrimitiveType.TINYINT || colType == PrimitiveType.SMALLINT + || colType == PrimitiveType.INT || colType == PrimitiveType.BIGINT || + colType == PrimitiveType.CHAR || colType == PrimitiveType.VARCHAR)) { + throw new AnalysisException(colType + " is not supported in bitmap index. " + + "invalid column: " + indexColName); + } else if (column.isKey() + || column.getAggregationType() == AggregateType.NONE + || column.getAggregationType() == AggregateType.REPLACE + || column.getAggregationType() == AggregateType.REPLACE_IF_NOT_NULL) { + found = true; + break; + } else { + // althrough the implemention supports bf for replace non-key column, + // for simplicity and unity, we don't expose that to user. + throw new AnalysisException( + "BITMAP index only used in columns of DUP_KEYS table or " + + "key columns of UNIQUE_KEYS/AGG_KEYS table. invalid column: " + + indexColName); + } + } + } + } + + if (!found) { + throw new AnalysisException("BITMAP column does not exist in table. 
invalid column: " + + indexColName); + } + } + indexes.add(new Index(indexDef.getIndexName(), indexDef.getColumns(), indexDef.getIndexType(), + indexDef.getComment())); + distinct.add(indexDef.getIndexName()); + distinctCol.add(indexDef.getColumns().stream().map(String::toUpperCase).collect(Collectors.toList())); + } + if (distinct.size() != indexes.size()) { + throw new AnalysisException("index name must be unique."); + } + if (distinctCol.size() != indexes.size()) { + throw new AnalysisException("same index columns have multiple index name is not allowed."); + } + } } private void analyzeEngineName() throws AnalysisException { @@ -411,6 +501,12 @@ public String toSql() { sb.append(" ").append(columnDef.toSql()); idx++; } + if (CollectionUtils.isNotEmpty(indexDefs)) { + sb.append(",\n"); + for (IndexDef indexDef : indexDefs) { + sb.append(" ").append(indexDef.toSql()); + } + } sb.append("\n)"); if (engineName != null) { sb.append(" ENGINE = ").append(engineName); diff --git a/fe/src/main/java/org/apache/doris/analysis/DropIndexClause.java b/fe/src/main/java/org/apache/doris/analysis/DropIndexClause.java new file mode 100644 index 00000000000000..40d566549af7a4 --- /dev/null +++ b/fe/src/main/java/org/apache/doris/analysis/DropIndexClause.java @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
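+
+// DropIndexClause is produced both by `ALTER TABLE ... DROP INDEX idx` (alter = true) and by the
+// standalone `DROP INDEX idx ON tbl` statement, which the parser rewrites into an AlterTableStmt
+// (alter = false). analyze() only checks that an index name was supplied; whether the index
+// actually exists is verified later when the alter clauses are processed.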
+ +package org.apache.doris.analysis; + +import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.UserException; + +import org.apache.commons.lang.StringUtils; + +import java.util.Map; + +public class DropIndexClause extends AlterTableClause { + private final String indexName; + private final TableName tableName; + private boolean alter; + + public DropIndexClause(String indexName, TableName tableName, boolean alter) { + this.indexName = indexName; + this.tableName = tableName; + this.alter = alter; + this.needTableStable = true; + } + + public String getIndexName() { + return indexName; + } + + public TableName getTableName() { + return tableName; + } + + public boolean isAlter() { + return alter; + } + + @Override + public Map getProperties() { + return null; + } + + @Override + public void analyze(Analyzer analyzer) throws UserException { + if (StringUtils.isEmpty(indexName)) { + throw new AnalysisException("index name is excepted"); + } + } + + @Override + public String toSql() { + StringBuilder stringBuilder = new StringBuilder(); + stringBuilder.append("DROP INDEX ").append(indexName); + if (!alter) { + stringBuilder.append(" ON ").append(tableName.toSql()); + } + return stringBuilder.toString(); + } +} diff --git a/fe/src/main/java/org/apache/doris/analysis/IndexDef.java b/fe/src/main/java/org/apache/doris/analysis/IndexDef.java new file mode 100644 index 00000000000000..6023790181a1ea --- /dev/null +++ b/fe/src/main/java/org/apache/doris/analysis/IndexDef.java @@ -0,0 +1,121 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
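+
+// IndexDef is the parsed form of an index definition such as
+// `INDEX idx_name (col) USING BITMAP COMMENT 'xxx'`, used both in CREATE TABLE and in index clauses.
+// Only the BITMAP type exists so far; for it, analyze() enforces exactly one column, no duplicated
+// columns, and a non-empty index name of at most 64 characters.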
+ +package org.apache.doris.analysis; + +import org.apache.doris.common.AnalysisException; + +import com.google.common.base.Strings; + +import java.util.List; +import java.util.TreeSet; + +public class IndexDef { + private String indexName; + private List columns; + private IndexType indexType; + private String comment; + + public IndexDef(String indexName, List columns, IndexType indexType, String comment) { + this.indexName = indexName; + this.columns = columns; + if (indexType == null) { + this.indexType = IndexType.BITMAP; + } else { + this.indexType = indexType; + } + if (columns == null) { + this.comment = ""; + } else { + this.comment = comment; + } + } + + public void analyze() throws AnalysisException { + if (indexType == IndexDef.IndexType.BITMAP) { + if (columns == null || columns.size() != 1) { + throw new AnalysisException("bitmap index definition expect at least one column."); + } + if (Strings.isNullOrEmpty(indexName)) { + throw new AnalysisException("index name cannot be blank."); + } + if (indexName.length() > 64) { + throw new AnalysisException("index name too long, the index name length at most is 64."); + } + TreeSet distinct = new TreeSet<>(String.CASE_INSENSITIVE_ORDER); + distinct.addAll(columns); + if (columns.size() != distinct.size()) { + throw new AnalysisException("columns of index has duplicated."); + } + } + } + + public String toSql() { + return toSql(null); + } + + public String toSql(String tableName) { + StringBuilder sb = new StringBuilder("INDEX "); + sb.append(indexName); + if (tableName != null && !tableName.isEmpty()) { + sb.append(" ON ").append(tableName); + } + sb.append(" ("); + boolean first = true; + for (String col : columns) { + if (first) { + first = false; + } else { + sb.append(","); + } + sb.append("`" + col + "`"); + } + sb.append(")"); + if (indexType != null) { + sb.append(" USING ").append(indexType.toString()); + } + if (comment != null) { + sb.append(" COMMENT '" + comment + "'"); + } + return sb.toString(); + } + + @Override + public String toString() { + return toSql(); + } + + public String getIndexName() { + return indexName; + } + + public List getColumns() { + return columns; + } + + public IndexType getIndexType() { + return indexType; + } + + public String getComment() { + return comment; + } + + public enum IndexType { + BITMAP, + } +} diff --git a/fe/src/main/java/org/apache/doris/analysis/ModifyTablePropertiesClause.java b/fe/src/main/java/org/apache/doris/analysis/ModifyTablePropertiesClause.java index c7678ddc36ebb9..e9b30185c04037 100644 --- a/fe/src/main/java/org/apache/doris/analysis/ModifyTablePropertiesClause.java +++ b/fe/src/main/java/org/apache/doris/analysis/ModifyTablePropertiesClause.java @@ -17,8 +17,10 @@ package org.apache.doris.analysis; +import org.apache.doris.catalog.TableProperty; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.Config; +import org.apache.doris.common.util.DynamicPartitionUtil; import org.apache.doris.common.util.PrintableMap; import org.apache.doris.common.util.PropertyAnalyzer; @@ -39,7 +41,8 @@ public void analyze(Analyzer analyzer) throws AnalysisException { throw new AnalysisException("Properties is not set"); } - if (properties.size() != 1) { + if (properties.size() != 1 + && !TableProperty.isSamePrefixProperties(properties, TableProperty.DYNAMIC_PARTITION_PROPERTY_PREFIX)) { throw new AnalysisException("Can only set one table property at a time"); } @@ -71,6 +74,8 @@ public void analyze(Analyzer analyzer) throws AnalysisException { throw new 
AnalysisException( "Property " + PropertyAnalyzer.PROPERTIES_STORAGE_FORMAT + " should be v2"); } + } else if (DynamicPartitionUtil.checkDynamicPartitionPropertiesExist(properties)) { + // do nothing, dynamic properties will be analyzed in SchemaChangeHandler.process } else { throw new AnalysisException("Unknown table property: " + properties.keySet()); } diff --git a/fe/src/main/java/org/apache/doris/analysis/ShowDynamicPartitionStmt.java b/fe/src/main/java/org/apache/doris/analysis/ShowDynamicPartitionStmt.java new file mode 100644 index 00000000000000..30bfaa47f97394 --- /dev/null +++ b/fe/src/main/java/org/apache/doris/analysis/ShowDynamicPartitionStmt.java @@ -0,0 +1,87 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.analysis; + +import com.google.common.base.Strings; +import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.ScalarType; +import org.apache.doris.cluster.ClusterNamespace; +import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.ErrorCode; +import org.apache.doris.common.ErrorReport; +import org.apache.doris.qe.ShowResultSetMetaData; + +public class ShowDynamicPartitionStmt extends ShowStmt { + private String db; + private static final ShowResultSetMetaData SHOW_DYNAMIC_PARTITION_META_DATA = + ShowResultSetMetaData.builder() + .addColumn(new Column("TableName", ScalarType.createVarchar(20))) + .addColumn(new Column("Enable", ScalarType.createVarchar(20))) + .addColumn(new Column("TimeUnit", ScalarType.createVarchar(20))) + .addColumn(new Column("End", ScalarType.createVarchar(20))) + .addColumn(new Column("Prefix", ScalarType.createVarchar(20))) + .addColumn(new Column("Buckets", ScalarType.createVarchar(20))) + .addColumn(new Column("LastUpdateTime", ScalarType.createVarchar(20))) + .addColumn(new Column("LastSchedulerTime", ScalarType.createVarchar(20))) + .addColumn(new Column("State", ScalarType.createVarchar(20))) + .addColumn(new Column("Msg", ScalarType.createVarchar(20))) + .build(); + + ShowDynamicPartitionStmt(String db) { + this.db = db; + } + + public String getDb() { + return db; + } + + @Override + public void analyze(Analyzer analyzer) throws AnalysisException { + if (Strings.isNullOrEmpty(db)) { + db = analyzer.getDefaultDb(); + if (Strings.isNullOrEmpty(db)) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_NO_DB_ERROR); + } + } else { + db = ClusterNamespace.getFullName(analyzer.getClusterName(), db); + } + + // we do not check db privs here. because user may not have any db privs, + // but if it has privs of tbls inside this db,it should be allowed to see this db. 
+ } + + @Override + public String toSql() { + StringBuilder sb = new StringBuilder(); + sb.append("SHOW DYNAMIC PARTITION TABLES"); + if (!Strings.isNullOrEmpty(db)) { + sb.append(" FROM ").append(db); + } + return sb.toString(); + } + + @Override + public String toString() { + return toSql(); + } + + @Override + public ShowResultSetMetaData getMetaData() { + return SHOW_DYNAMIC_PARTITION_META_DATA; + } +} \ No newline at end of file diff --git a/fe/src/main/java/org/apache/doris/analysis/ShowIndexStmt.java b/fe/src/main/java/org/apache/doris/analysis/ShowIndexStmt.java new file mode 100644 index 00000000000000..dd5237bedd99b7 --- /dev/null +++ b/fe/src/main/java/org/apache/doris/analysis/ShowIndexStmt.java @@ -0,0 +1,108 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.analysis; + +import org.apache.doris.catalog.Catalog; +import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.ScalarType; +import org.apache.doris.cluster.ClusterNamespace; +import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.ErrorCode; +import org.apache.doris.common.ErrorReport; +import org.apache.doris.common.UserException; +import org.apache.doris.mysql.privilege.PrivPredicate; +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.ShowResultSetMetaData; + +import com.google.common.base.Strings; + +public class ShowIndexStmt extends ShowStmt { + private static final ShowResultSetMetaData META_DATA = + ShowResultSetMetaData.builder() + .addColumn(new Column("Table", ScalarType.createVarchar(64))) + .addColumn(new Column("Index_name", ScalarType.createVarchar(10))) + .addColumn(new Column("Column_name", ScalarType.createVarchar(80))) + .addColumn(new Column("Index_type", ScalarType.createVarchar(64))) + .addColumn(new Column("Comment", ScalarType.createVarchar(80))) + .build(); + private String dbName; + private TableName tableName; + + public ShowIndexStmt(String dbName, TableName tableName) { + this.dbName = dbName; + this.tableName = tableName; + } + + @Override + public void analyze(Analyzer analyzer) throws AnalysisException, UserException { + super.analyze(analyzer); + if (Strings.isNullOrEmpty(tableName.getTbl())) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_NO_TABLES_USED); + } + if (Strings.isNullOrEmpty(dbName) && Strings.isNullOrEmpty(tableName.getDb())) { + dbName = analyzer.getDefaultDb(); + tableName.setDb(dbName); + } else if (Strings.isNullOrEmpty(dbName) && !Strings.isNullOrEmpty(tableName.getDb())) { + dbName = tableName.getDb(); + } else if (!Strings.isNullOrEmpty(dbName) && Strings.isNullOrEmpty(tableName.getDb())) { + tableName.setDb(dbName); + } + if (!dbName.equalsIgnoreCase(tableName.getDb())) { + 
ErrorReport.reportAnalysisException(ErrorCode.ERR_WRONG_TABLE_NAME); + } + dbName = ClusterNamespace.getFullName(analyzer.getClusterName(), dbName); + + if (!Catalog.getCurrentCatalog().getAuth().checkDbPriv(ConnectContext.get(), dbName, PrivPredicate.SHOW)) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_DB_ACCESS_DENIED, analyzer.getQualifiedUser(), dbName); + } + if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), dbName, dbName, + PrivPredicate.SHOW)) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_TABLEACCESS_DENIED_ERROR, analyzer.getQualifiedUser(), + tableName.toString()); + } + } + + @Override + public String toSql() { + StringBuilder sb = new StringBuilder("SHOW INDEX FROM "); + sb.append(tableName.toSql()); + return sb.toString(); + } + + @Override + public String toString() { + return toSql(); + } + + public String getDbName() { + if (dbName != null) { + return dbName; + } else { + return tableName.getDb(); + } + } + + public TableName getTableName() { + return tableName; + } + + @Override + public ShowResultSetMetaData getMetaData() { + return META_DATA; + } +} diff --git a/fe/src/main/java/org/apache/doris/analysis/ShowPartitionsStmt.java b/fe/src/main/java/org/apache/doris/analysis/ShowPartitionsStmt.java index 8184fa4a94838c..c51c339ad9de5b 100644 --- a/fe/src/main/java/org/apache/doris/analysis/ShowPartitionsStmt.java +++ b/fe/src/main/java/org/apache/doris/analysis/ShowPartitionsStmt.java @@ -19,18 +19,21 @@ import org.apache.doris.catalog.Catalog; import org.apache.doris.catalog.Column; -import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.Database; import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.Table; +import org.apache.doris.catalog.Type; import org.apache.doris.cluster.ClusterNamespace; import org.apache.doris.common.AnalysisException; import org.apache.doris.common.ErrorCode; import org.apache.doris.common.ErrorReport; import org.apache.doris.common.UserException; +import org.apache.doris.common.proc.PartitionsProcDir; import org.apache.doris.common.proc.ProcNodeInterface; import org.apache.doris.common.proc.ProcResult; import org.apache.doris.common.proc.ProcService; +import org.apache.doris.common.util.OrderByPair; import org.apache.doris.mysql.privilege.PrivPredicate; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.ShowResultSetMetaData; @@ -40,19 +43,41 @@ import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; +import java.util.ArrayList; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + public class ShowPartitionsStmt extends ShowStmt { private static final Logger LOG = LogManager.getLogger(ShowPartitionsStmt.class); + private static final String FILTER_PARTITION_ID = "PartitionId"; + private static final String FILTER_PARTITION_NAME = "PartitionName"; + private static final String FILTER_STATE = "State"; + private static final String FILTER_BUCKETS = "Buckets"; + private static final String FILTER_REPLICATION_NUM = "ReplicationNum"; + private static final String FILTER_LAST_CONSISTENCY_CHECK_TIME = "LastConsistencyCheckTime"; + private String dbName; private String tableName; - private String partitionName; + private Expr whereClause; + private List orderByElements; + private LimitElement limitElement; + private List orderByPairs; + private Map filterMap; private ProcNodeInterface node; - public ShowPartitionsStmt(TableName tableName, String 
partitionName) { + public ShowPartitionsStmt(TableName tableName, Expr whereClause, List orderByElements, + LimitElement limitElement) { this.dbName = tableName.getDb(); this.tableName = tableName.getTbl(); - this.partitionName = partitionName; + this.whereClause = whereClause; + this.orderByElements = orderByElements; + this.limitElement = limitElement; + if (whereClause != null) { + this.filterMap = new HashMap<>(); + } } public String getDbName() { @@ -63,8 +88,24 @@ public String getTableName() { return tableName; } - public String getPartitionName() { - return partitionName; + public Expr getWhereClause() { + return whereClause; + } + + public List getOrderByElements() { + return orderByElements; + } + + public List getOrderByPairs() { + return orderByPairs; + } + + public LimitElement getLimitElement() { + return limitElement; + } + + public Map getFilterMap() { + return filterMap; } public ProcNodeInterface getNode() { @@ -73,17 +114,7 @@ public ProcNodeInterface getNode() { @Override public void analyze(Analyzer analyzer) throws AnalysisException, UserException { - super.analyze(analyzer); - - if (Strings.isNullOrEmpty(dbName)) { - dbName = analyzer.getDefaultDb(); - if (Strings.isNullOrEmpty(dbName)) { - throw new AnalysisException("No db name in show data statement."); - } - } else { - dbName = ClusterNamespace.getFullName(getClusterName(), dbName); - } - + analyzeImpl(analyzer); // check access if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), dbName, tableName, PrivPredicate.SHOW)) { @@ -92,11 +123,11 @@ public void analyze(Analyzer analyzer) throws AnalysisException, UserException { ConnectContext.get().getRemoteIP(), tableName); } - Database db = Catalog.getInstance().getDb(dbName); if (db == null) { - throw new AnalysisException("Database[" + dbName + "] does not exist"); + ErrorReport.reportAnalysisException(ErrorCode.ERR_BAD_DB_ERROR, dbName); } + db.readLock(); try { Table table = db.getTable(tableName); @@ -122,6 +153,94 @@ public void analyze(Analyzer analyzer) throws AnalysisException, UserException { } } + public void analyzeImpl(Analyzer analyzer) throws UserException { + super.analyze(analyzer); + if (Strings.isNullOrEmpty(dbName)) { + dbName = analyzer.getDefaultDb(); + if (Strings.isNullOrEmpty(dbName)) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_NO_DB_ERROR); + } + } else { + dbName = ClusterNamespace.getFullName(getClusterName(), dbName); + } + + // analyze where clause if not null + if (whereClause != null) { + analyzeSubPredicate(whereClause); + } + + // order by + if (orderByElements != null && !orderByElements.isEmpty()) { + orderByPairs = new ArrayList<>(); + for (OrderByElement orderByElement : orderByElements) { + if (!(orderByElement.getExpr() instanceof SlotRef)) { + throw new AnalysisException("Should order by column"); + } + SlotRef slotRef = (SlotRef) orderByElement.getExpr(); + int index = PartitionsProcDir.analyzeColumn(slotRef.getColumnName()); + OrderByPair orderByPair = new OrderByPair(index, !orderByElement.getIsAsc()); + orderByPairs.add(orderByPair); + } + } + + if (limitElement != null) { + limitElement.analyze(analyzer); + } + } + + private void analyzeSubPredicate(Expr subExpr) throws AnalysisException { + if (subExpr == null) { + return; + } + if (subExpr instanceof CompoundPredicate) { + CompoundPredicate cp = (CompoundPredicate) subExpr; + if (cp.getOp() != CompoundPredicate.Operator.AND) { + throw new AnalysisException("Only allow compound predicate with operator AND"); + } + 
analyzeSubPredicate(cp.getChild(0)); + analyzeSubPredicate(cp.getChild(1)); + return; + } + + if (!(subExpr.getChild(0) instanceof SlotRef)) { + throw new AnalysisException("Show filter by column"); + } + + String leftKey = ((SlotRef) subExpr.getChild(0)).getColumnName(); + if (subExpr instanceof BinaryPredicate) { + BinaryPredicate binaryPredicate = (BinaryPredicate) subExpr; + if (leftKey.equalsIgnoreCase(FILTER_PARTITION_NAME) || leftKey.equalsIgnoreCase(FILTER_STATE)) { + if (binaryPredicate.getOp() != BinaryPredicate.Operator.EQ) { + throw new AnalysisException(String.format("Only operator =|like are supported for %s", leftKey)); + } + } else if (leftKey.equalsIgnoreCase(FILTER_LAST_CONSISTENCY_CHECK_TIME)) { + if (!(subExpr.getChild(1) instanceof StringLiteral)) { + throw new AnalysisException("Where clause : LastConsistencyCheckTime =|>=|<=|>|<|!= " + + "\"2019-12-22|2019-12-22 22:22:00\""); + } + subExpr.setChild(1,(subExpr.getChild(1)).castTo(Type.DATETIME)); + } else if (!leftKey.equalsIgnoreCase(FILTER_PARTITION_ID) && !leftKey.equalsIgnoreCase(FILTER_BUCKETS) && + !leftKey.equalsIgnoreCase(FILTER_REPLICATION_NUM)) { + throw new AnalysisException("Only the columns of PartitionId/PartitionName/" + + "State/Buckets/ReplicationNum/LastConsistencyCheckTime are supported."); + } + } else if (subExpr instanceof LikePredicate) { + LikePredicate likePredicate = (LikePredicate) subExpr; + if (leftKey.equalsIgnoreCase(FILTER_PARTITION_NAME) || leftKey.equalsIgnoreCase(FILTER_STATE)) { + if (likePredicate.getOp() != LikePredicate.Operator.LIKE) { + throw new AnalysisException("Where clause : PartitionName|State like " + + "\"p20191012|NORMAL\""); + } + } else { + throw new AnalysisException("Where clause : PartitionName|State like \"p20191012|NORMAL\""); + } + } else { + throw new AnalysisException("Only operator =|>=|<=|>|<|!=|like are supported."); + } + filterMap.put(leftKey.toLowerCase(), subExpr); + } + + @Override public ShowResultSetMetaData getMetaData() { ShowResultSetMetaData.Builder builder = ShowResultSetMetaData.builder(); @@ -139,4 +258,37 @@ public ShowResultSetMetaData getMetaData() { return builder.build(); } + @Override + public String toSql() { + StringBuilder sb = new StringBuilder(); + sb.append("SHOW PARTITIONS FROM "); + if (!Strings.isNullOrEmpty(dbName)) { + sb.append("`").append(dbName).append("`"); + } + if (!Strings.isNullOrEmpty(tableName)) { + sb.append(".`").append(tableName).append("`"); + } + if (whereClause != null) { + sb.append(" WHERE ").append(whereClause.toSql()); + } + // Order By clause + if (orderByElements != null) { + sb.append(" ORDER BY "); + for (int i = 0; i < orderByElements.size(); ++i) { + sb.append(orderByElements.get(i).toSql()); + sb.append((i + 1 != orderByElements.size()) ? 
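To make the filter rules above concrete, here is a small, self-contained sketch listing the kinds of statements the analyzeSubPredicate/order-by handling is meant to accept (database, table and literal values below are hypothetical examples):

// Examples only; each string maps to one branch of the predicate analysis above.
public class ShowPartitionsFilterExamples {
    public static void main(String[] args) {
        String[] accepted = {
            // BinaryPredicate: PartitionName/State only allow '='
            "SHOW PARTITIONS FROM db1.tbl1 WHERE PartitionName = \"p20191222\"",
            // LikePredicate: PartitionName/State also allow LIKE
            "SHOW PARTITIONS FROM db1.tbl1 WHERE State LIKE \"NORMAL\"",
            // LastConsistencyCheckTime: the string literal is cast to DATETIME, =|>=|<=|>|<|!= allowed
            "SHOW PARTITIONS FROM db1.tbl1 WHERE LastConsistencyCheckTime > \"2019-12-22 22:22:00\"",
            // PartitionId/Buckets/ReplicationNum take the other binary operators;
            // predicates may only be combined with AND, and ORDER BY/LIMIT are analyzed separately
            "SHOW PARTITIONS FROM db1.tbl1 WHERE Buckets >= 10 AND ReplicationNum = 3 ORDER BY PartitionId DESC LIMIT 10"
        };
        for (String sql : accepted) {
            System.out.println(sql);
        }
    }
}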
", " : ""); + } + } + + if (limitElement != null) { + sb.append(limitElement.toSql()); + } + return sb.toString(); + } + + @Override + public String toString() { + return toSql(); + } + } diff --git a/fe/src/main/java/org/apache/doris/backup/RestoreJob.java b/fe/src/main/java/org/apache/doris/backup/RestoreJob.java index bb0dc34e292a1c..02e67d822b90f2 100644 --- a/fe/src/main/java/org/apache/doris/backup/RestoreJob.java +++ b/fe/src/main/java/org/apache/doris/backup/RestoreJob.java @@ -631,7 +631,7 @@ private void checkAndPrepareMeta() { schemaHash, restoreReplica.getVersion(), restoreReplica.getVersionHash(), keysType, TStorageType.COLUMN, TStorageMedium.HDD /* all restored replicas will be saved to HDD */, - columns, bfColumns, bfFpp, null); + columns, bfColumns, bfFpp, null, localTbl.getCopiedIndexes()); task.setInRestoreMode(true); batchTask.addTask(task); } @@ -662,7 +662,7 @@ private void checkAndPrepareMeta() { restoreTbl.getId(), restorePart.getId(), index.getId(), tablet.getId(), shortKeyColumnCount, schemaHash, replica.getVersion(), replica.getVersionHash(), keysType, TStorageType.COLUMN, TStorageMedium.HDD, columns, - bfColumns, bfFpp, null); + bfColumns, bfFpp, null, restoreTbl.getCopiedIndexes()); task.setInRestoreMode(true); batchTask.addTask(task); } diff --git a/fe/src/main/java/org/apache/doris/catalog/Catalog.java b/fe/src/main/java/org/apache/doris/catalog/Catalog.java index 12b4abc62aca84..704193d7c2733f 100644 --- a/fe/src/main/java/org/apache/doris/catalog/Catalog.java +++ b/fe/src/main/java/org/apache/doris/catalog/Catalog.java @@ -85,6 +85,7 @@ import org.apache.doris.catalog.Replica.ReplicaState; import org.apache.doris.catalog.Table.TableType; import org.apache.doris.clone.ColocateTableBalancer; +import org.apache.doris.clone.DynamicPartitionScheduler; import org.apache.doris.clone.TabletChecker; import org.apache.doris.clone.TabletScheduler; import org.apache.doris.clone.TabletSchedulerStat; @@ -104,12 +105,14 @@ import org.apache.doris.common.UserException; import org.apache.doris.common.io.Text; import org.apache.doris.common.util.Daemon; +import org.apache.doris.common.util.DynamicPartitionUtil; import org.apache.doris.common.util.KuduUtil; import org.apache.doris.common.util.MasterDaemon; import org.apache.doris.common.util.PrintableMap; import org.apache.doris.common.util.PropertyAnalyzer; import org.apache.doris.common.util.QueryableReentrantLock; import org.apache.doris.common.util.SmallFileMgr; +import org.apache.doris.common.util.TimeUtils; import org.apache.doris.common.util.Util; import org.apache.doris.consistency.ConsistencyChecker; import org.apache.doris.deploy.DeployManager; @@ -154,6 +157,7 @@ import org.apache.doris.persist.DropInfo; import org.apache.doris.persist.DropLinkDbAndUpdateDbInfo; import org.apache.doris.persist.DropPartitionInfo; +import org.apache.doris.persist.ModifyDynamicPartitionInfo; import org.apache.doris.persist.EditLog; import org.apache.doris.persist.ModifyPartitionInfo; import org.apache.doris.persist.PartitionPersistInfo; @@ -201,6 +205,7 @@ import com.sleepycat.je.rep.NetworkRestore; import com.sleepycat.je.rep.NetworkRestoreConfig; +import org.apache.commons.collections.CollectionUtils; import org.apache.kudu.ColumnSchema; import org.apache.kudu.Schema; import org.apache.kudu.client.CreateTableOptions; @@ -369,6 +374,8 @@ public class Catalog { private SmallFileMgr smallFileMgr; + private DynamicPartitionScheduler dynamicPartitionScheduler; + public List getFrontends(FrontendNodeType nodeType) { if (nodeType == null) { 
// get all @@ -417,6 +424,10 @@ public MetaReplayState getMetaReplayState() { return metaReplayState; } + public DynamicPartitionScheduler getDynamicPartitionScheduler() { + return this.dynamicPartitionScheduler; + } + private static class SingletonHolder { private static final Catalog INSTANCE = new Catalog(); } @@ -490,6 +501,9 @@ private Catalog() { this.routineLoadTaskScheduler = new RoutineLoadTaskScheduler(routineLoadManager); this.smallFileMgr = new SmallFileMgr(); + + this.dynamicPartitionScheduler = new DynamicPartitionScheduler("DynamicPartitionScheduler", + Config.dynamic_partition_check_interval_seconds * 1000L); } public static void destroyCheckpoint() { @@ -1177,6 +1191,8 @@ private void startMasterOnlyDaemonThreads() { // start routine load scheduler routineLoadScheduler.start(); routineLoadTaskScheduler.start(); + // start dynamic partition task + dynamicPartitionScheduler.start(); } // start threads that should running on all FE @@ -2975,7 +2991,7 @@ public void addPartition(Database db, String tableName, AddPartitionClause addPa dataProperty.getStorageMedium(), singlePartitionDesc.getReplicationNum(), versionInfo, bfColumns, olapTable.getBfFpp(), - tabletIdSet); + tabletIdSet, olapTable.getCopiedIndexes()); // check again db.writeLock(); @@ -3089,6 +3105,7 @@ public void replayAddPartition(PartitionPersistInfo info) throws DdlException { } public void dropPartition(Database db, OlapTable olapTable, DropPartitionClause clause) throws DdlException { + DynamicPartitionUtil.checkAlterAllowed(olapTable); Preconditions.checkArgument(db.isWriteLockHeldByCurrentThread()); String partitionName = clause.getPartitionName(); @@ -3254,7 +3271,8 @@ private Partition createPartitionWithIndices(String clusterName, long dbId, long Pair versionInfo, Set bfColumns, double bfFpp, - Set tabletIdSet) throws DdlException { + Set tabletIdSet, + List indexes) throws DdlException { // create base index first. Preconditions.checkArgument(baseIndexId != -1); MaterializedIndex baseIndex = new MaterializedIndex(baseIndexId, IndexState.NORMAL); @@ -3315,7 +3333,7 @@ private Partition createPartitionWithIndices(String clusterName, long dbId, long keysType, storageType, storageMedium, schema, bfColumns, bfFpp, - countDownLatch); + countDownLatch, indexes); batchTask.addTask(task); // add to AgentTaskQueue for handling finish report. 
// not for resending task @@ -3385,6 +3403,9 @@ private void createOlapTable(Database db, CreateTableStmt stmt) throws DdlExcept } partitionInfo = partitionDesc.toPartitionInfo(baseSchema, partitionNameToId); } else { + if (DynamicPartitionUtil.checkDynamicPartitionPropertiesExist(stmt.getProperties())) { + throw new DdlException("Only support dynamic partition properties on range partition table"); + } long partitionId = getNextId(); // use table name as single partition name partitionNameToId.put(tableName, partitionId); @@ -3405,10 +3426,13 @@ private void createOlapTable(Database db, CreateTableStmt stmt) throws DdlExcept short shortKeyColumnCount = Catalog.calcShortKeyColumnCount(baseSchema, stmt.getProperties()); LOG.debug("create table[{}] short key column count: {}", tableName, shortKeyColumnCount); + // indexes + TableIndexes indexes = new TableIndexes(stmt.getIndexes()); + // create table long tableId = Catalog.getInstance().getNextId(); OlapTable olapTable = new OlapTable(tableId, tableName, baseSchema, keysType, partitionInfo, - distributionInfo); + distributionInfo, indexes); olapTable.setComment(stmt.getComment()); // set base index id @@ -3542,7 +3566,7 @@ private void createOlapTable(Database db, CreateTableStmt stmt) throws DdlExcept partitionInfo.getDataProperty(partitionId).getStorageMedium(), partitionInfo.getReplicationNum(partitionId), versionInfo, bfColumns, bfFpp, - tabletIdSet); + tabletIdSet, olapTable.getCopiedIndexes()); olapTable.addPartition(partition); } else if (partitionInfo.getType() == PartitionType.RANGE) { try { @@ -3551,6 +3575,8 @@ private void createOlapTable(Database db, CreateTableStmt stmt) throws DdlExcept PropertyAnalyzer.analyzeDataProperty(stmt.getProperties(), DataProperty.DEFAULT_HDD_DATA_PROPERTY); PropertyAnalyzer.analyzeReplicationNum(properties, FeConstants.default_replication_num); + DynamicPartitionUtil.checkAndSetDynamicPartitionProperty(olapTable, properties); + if (properties != null && !properties.isEmpty()) { // here, all properties should be checked throw new DdlException("Unknown properties: " + properties); @@ -3573,7 +3599,7 @@ private void createOlapTable(Database db, CreateTableStmt stmt) throws DdlExcept dataProperty.getStorageMedium(), partitionInfo.getReplicationNum(entry.getValue()), versionInfo, bfColumns, bfFpp, - tabletIdSet); + tabletIdSet, olapTable.getCopiedIndexes()); olapTable.addPartition(partition); } } else { @@ -3583,7 +3609,7 @@ private void createOlapTable(Database db, CreateTableStmt stmt) throws DdlExcept if (!db.createTableWithLock(olapTable, false, stmt.isSetIfNotExists())) { ErrorReport.reportDdlException(ErrorCode.ERR_CANT_CREATE_TABLE, tableName, "table already exists"); } - + // we have added these index to memory, only need to persist here if (getColocateTableIndex().isColocateTable(tableId)) { GroupId groupId = getColocateTableIndex().getGroup(tableId); @@ -3591,8 +3617,11 @@ private void createOlapTable(Database db, CreateTableStmt stmt) throws DdlExcept ColocatePersistInfo info = ColocatePersistInfo.createForAddTable(groupId, tableId, backendsPerBucketSeq); editLog.logColocateAddTable(info); } - LOG.info("successfully create table[{};{}]", tableName, tableId); + // register or remove table from DynamicPartition after table created + DynamicPartitionUtil.registerOrRemoveDynamicPartitionTable(db.getId(), olapTable); + dynamicPartitionScheduler.createOrUpdateRuntimeInfo( + tableName, DynamicPartitionScheduler.LAST_UPDATE_TIME, TimeUtils.getCurrentFormatTime()); } catch (DdlException e) { for (Long 
tabletId : tabletIdSet) { Catalog.getCurrentInvertedIndex().deleteTablet(tabletId); @@ -3789,6 +3818,15 @@ public static void getDdlStmt(Table table, List createTableStmt, List properties) throws DdlException { + TableProperty tableProperty = table.getTableProperty(); + if (tableProperty == null) { + DynamicPartitionUtil.checkAndSetDynamicPartitionProperty(table, properties); + } else { + Map analyzedDynamicPartition = DynamicPartitionUtil.analyzeDynamicPartition(properties); + tableProperty.modifyTableProperties(analyzedDynamicPartition); + } + + DynamicPartitionUtil.registerOrRemoveDynamicPartitionTable(db.getId(), table); + dynamicPartitionScheduler.createOrUpdateRuntimeInfo( + table.getName(), DynamicPartitionScheduler.LAST_UPDATE_TIME, TimeUtils.getCurrentFormatTime()); + ModifyDynamicPartitionInfo info = new ModifyDynamicPartitionInfo(db.getId(), table.getId(), table.getTableProperty().getProperties()); + editLog.logDynamicPartition(info); + } + + public void replayModifyTableDynamicPartition(ModifyDynamicPartitionInfo info) { + long dbId = info.getDbId(); + long tableId = info.getTableId(); + Map properties = info.getProperties(); + + Database db = getDb(dbId); + db.writeLock(); + try { + OlapTable olapTable = (OlapTable) db.getTable(tableId); + TableProperty tableProperty = olapTable.getTableProperty(); + if (tableProperty == null) { + olapTable.setTableProperty(new TableProperty(properties).buildDynamicProperty()); + } else { + tableProperty.modifyTableProperties(properties); + } + } finally { + db.writeUnlock(); + } + } + /* * used for handling AlterClusterStmt * (for client is the ALTER CLUSTER command). @@ -5980,7 +6060,7 @@ public void truncateTable(TruncateTableStmt truncateTableStmt) throws DdlExcepti null /* version info */, copiedTbl.getCopiedBfColumns(), copiedTbl.getBfFpp(), - tabletIdSet); + tabletIdSet, copiedTbl.getCopiedIndexes()); newPartitions.add(newPartition); } } catch (DdlException e) { @@ -6179,7 +6259,7 @@ public void convertDistributionType(Database db, OlapTable tbl) throws DdlExcept throw new DdlException("Table " + tbl.getName() + " is not random distributed"); } TableInfo tableInfo = TableInfo.createForModifyDistribution(db.getId(), tbl.getId()); - editLog.logModifyDitrubutionType(tableInfo); + editLog.logModifyDistributionType(tableInfo); LOG.info("finished to modify distribution type of table: " + tbl.getName()); } finally { db.writeUnlock(); diff --git a/fe/src/main/java/org/apache/doris/catalog/DynamicPartitionProperty.java b/fe/src/main/java/org/apache/doris/catalog/DynamicPartitionProperty.java new file mode 100644 index 00000000000000..7906caeb5c8322 --- /dev/null +++ b/fe/src/main/java/org/apache/doris/catalog/DynamicPartitionProperty.java @@ -0,0 +1,82 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.catalog; + +import java.util.Map; + +public class DynamicPartitionProperty{ + public static final String TIME_UNIT = "dynamic_partition.time_unit"; + public static final String END = "dynamic_partition.end"; + public static final String PREFIX = "dynamic_partition.prefix"; + public static final String BUCKETS = "dynamic_partition.buckets"; + public static final String ENABLE = "dynamic_partition.enable"; + + private boolean exist; + + private boolean enable; + private String timeUnit; + private int end; + private String prefix; + private int buckets; + + DynamicPartitionProperty(Map properties) { + if (properties != null && !properties.isEmpty()) { + this.exist = true; + this.enable = Boolean.parseBoolean(properties.get(ENABLE)); + this.timeUnit = properties.get(TIME_UNIT); + this.end = Integer.parseInt(properties.get(END)); + this.prefix = properties.get(PREFIX); + this.buckets = Integer.parseInt(properties.get(BUCKETS)); + } else { + this.exist = false; + } + } + + public boolean isExist() { + return exist; + } + + public String getTimeUnit() { + return timeUnit; + } + + public int getEnd() { + return end; + } + + public String getPrefix() { + return prefix; + } + + public int getBuckets() { + return buckets; + } + + public boolean getEnable() { + return enable; + } + + @Override + public String toString() { + return ",\n\"" + ENABLE + "\" = \"" + enable + "\"" + + ",\n\"" + TIME_UNIT + "\" = \"" + timeUnit + "\"" + + ",\n\"" + END + "\" = \"" + end + "\"" + + ",\n\"" + PREFIX + "\" = \"" + prefix + "\"" + + ",\n\"" + BUCKETS + "\" = \"" + buckets + "\""; + } +} diff --git a/fe/src/main/java/org/apache/doris/catalog/FunctionSet.java b/fe/src/main/java/org/apache/doris/catalog/FunctionSet.java index a1a4a272fce2b4..c49cfaa0fe6f99 100644 --- a/fe/src/main/java/org/apache/doris/catalog/FunctionSet.java +++ b/fe/src/main/java/org/apache/doris/catalog/FunctionSet.java @@ -1000,7 +1000,7 @@ private void initAggregateBuiltins() { HLL_UNION_AGG_UPDATE_SYMBOL.get(t), "_ZN5doris12HllFunctions9hll_mergeEPN9doris_udf15FunctionContextERKNS1_9StringValEPS4_", "_ZN5doris12HllFunctions13hll_serializeEPN9doris_udf15FunctionContextERKNS1_9StringValE", - "_ZN5doris12HllFunctions12hll_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE", + "_ZN5doris12HllFunctions13hll_get_valueEPN9doris_udf15FunctionContextERKNS1_9StringValE", null, "_ZN5doris12HllFunctions12hll_finalizeEPN9doris_udf15FunctionContextERKNS1_9StringValE", true, true, true)); diff --git a/fe/src/main/java/org/apache/doris/catalog/Index.java b/fe/src/main/java/org/apache/doris/catalog/Index.java new file mode 100644 index 00000000000000..d02915d985da04 --- /dev/null +++ b/fe/src/main/java/org/apache/doris/catalog/Index.java @@ -0,0 +1,152 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.catalog; + +import org.apache.doris.analysis.IndexDef; +import org.apache.doris.common.io.Text; +import org.apache.doris.common.io.Writable; +import org.apache.doris.persist.gson.GsonUtils; +import org.apache.doris.thrift.TIndexType; +import org.apache.doris.thrift.TOlapTableIndex; + +import com.google.gson.annotations.SerializedName; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.ArrayList; +import java.util.List; + +/** + * Internal representation of index, including index type, name, columns and comments. + * This class will used in olaptable + */ +public class Index implements Writable { + @SerializedName(value = "indexName") + private String indexName; + @SerializedName(value = "columns") + private List columns; + @SerializedName(value = "indexType") + private IndexDef.IndexType indexType; + @SerializedName(value = "comment") + private String comment; + + public Index(String indexName, List columns, IndexDef.IndexType indexType, String comment) { + this.indexName = indexName; + this.columns = columns; + this.indexType = indexType; + this.comment = comment; + } + + public Index() { + this.indexName = null; + this.columns = null; + this.indexType = null; + this.comment = null; + } + + public String getIndexName() { + return indexName; + } + + public void setIndexName(String indexName) { + this.indexName = indexName; + } + + public List getColumns() { + return columns; + } + + public void setColumns(List columns) { + this.columns = columns; + } + + public IndexDef.IndexType getIndexType() { + return indexType; + } + + public void setIndexType(IndexDef.IndexType indexType) { + this.indexType = indexType; + } + + public String getComment() { + return comment; + } + + public void setComment(String comment) { + this.comment = comment; + } + + @Override + public void write(DataOutput out) throws IOException { + Text.writeString(out, GsonUtils.GSON.toJson(this)); + } + + public static Index read(DataInput in) throws IOException { + String json = Text.readString(in); + return GsonUtils.GSON.fromJson(json, Index.class); + } + + @Override + public int hashCode() { + return 31 * (indexName.hashCode() + columns.hashCode() + indexType.hashCode()); + } + + public Index clone() { + return new Index(indexName, new ArrayList<>(columns), indexType, comment); + } + + @Override + public String toString() { + return toSql(); + } + + public String toSql() { + StringBuilder sb = new StringBuilder("INDEX "); + sb.append(indexName); + sb.append(" ("); + boolean first = true; + for (String col : columns) { + if (first) { + first = false; + } else { + sb.append(","); + } + sb.append("`" + col + "`"); + } + sb.append(")"); + if (indexType != null) { + sb.append(" USING ").append(indexType.toString()); + } + if (comment != null) { + sb.append(" COMMENT '" + comment + "'"); + } + return sb.toString(); + } + + public TOlapTableIndex toThrift() { + TOlapTableIndex tIndex = new TOlapTableIndex(); + tIndex.setIndex_name(indexName); + tIndex.setColumns(columns); + 
tIndex.setIndex_type(TIndexType.valueOf(indexType.toString())); + if (columns != null) { + tIndex.setComment(comment); + } + return tIndex; + } +} diff --git a/fe/src/main/java/org/apache/doris/catalog/OlapTable.java b/fe/src/main/java/org/apache/doris/catalog/OlapTable.java index df1c72f7286e0a..459d02325782ca 100644 --- a/fe/src/main/java/org/apache/doris/catalog/OlapTable.java +++ b/fe/src/main/java/org/apache/doris/catalog/OlapTable.java @@ -66,6 +66,7 @@ import java.util.LinkedList; import java.util.List; import java.util.Map; +import java.util.Optional; import java.util.Set; import java.util.zip.Adler32; @@ -111,6 +112,8 @@ public enum OlapTableState { private double bfFpp; private String colocateGroup; + + private TableIndexes indexes; // In former implementation, base index id is same as table id. // But when refactoring the process of alter table job, we find that @@ -120,6 +123,8 @@ public enum OlapTableState { // The init value is -1, which means there is not partition and index at all. private long baseIndexId = -1; + private TableProperty tableProperty; + public OlapTable() { // for persist super(TableType.OLAP); @@ -139,10 +144,19 @@ public OlapTable() { this.bfFpp = 0; this.colocateGroup = null; + + this.indexes = null; + + this.tableProperty = null; } - public OlapTable(long id, String tableName, List baseSchema, - KeysType keysType, PartitionInfo partitionInfo, DistributionInfo defaultDistributionInfo) { + public OlapTable(long id, String tableName, List baseSchema, KeysType keysType, + PartitionInfo partitionInfo, DistributionInfo defaultDistributionInfo) { + this(id, tableName, baseSchema, keysType, partitionInfo, defaultDistributionInfo, null); + } + + public OlapTable(long id, String tableName, List baseSchema, KeysType keysType, + PartitionInfo partitionInfo, DistributionInfo defaultDistributionInfo, TableIndexes indexes) { super(id, tableName, TableType.OLAP, baseSchema); this.state = OlapTableState.NORMAL; @@ -167,6 +181,28 @@ public OlapTable(long id, String tableName, List baseSchema, this.bfFpp = 0; this.colocateGroup = null; + + if (indexes == null) { + this.indexes = null; + } else { + this.indexes = indexes; + } + + this.tableProperty = null; + } + + public void setTableProperty(TableProperty tableProperty) { + this.tableProperty = tableProperty; + } + + public TableProperty getTableProperty() { + return this.tableProperty; + } + + public boolean dynamicPartitionExists() { + return tableProperty != null + && tableProperty.getDynamicPartitionProperty() != null + && tableProperty.getDynamicPartitionProperty().isExist(); } public void setBaseIndexId(long baseIndexId) { @@ -185,6 +221,26 @@ public OlapTableState getState() { return state; } + public List getIndexes() { + if (indexes == null) { + return Lists.newArrayList(); + } + return indexes.getIndexes(); + } + + public TableIndexes getTableIndexes() { + return indexes; + } + + public Map getIndexesMap() { + Map indexMap = new HashMap<>(); + if (indexes != null) { + Optional.ofNullable(indexes.getIndexes()).orElse(Collections.emptyList()).stream().forEach( + i -> indexMap.put(i.getIndexName(), i)); + } + return indexMap; + } + public void setName(String newName) { // change name in indexNameToId long baseIndexId = indexNameToId.remove(this.name); @@ -574,6 +630,13 @@ public Set getCopiedBfColumns() { return Sets.newHashSet(bfColumns); } + public List getCopiedIndexes() { + if (indexes == null) { + return Lists.newArrayList(); + } + return indexes.getCopiedIndexes(); + } + public double getBfFpp() { return 
bfFpp; } @@ -583,6 +646,13 @@ public void setBloomFilterInfo(Set bfColumns, double bfFpp) { this.bfFpp = bfFpp; } + public void setIndexes(List indexes) { + if (this.indexes == null) { + this.indexes = new TableIndexes(null); + } + this.indexes.setIndexes(indexes); + } + public String getColocateGroup() { return colocateGroup; } @@ -819,6 +889,22 @@ public void write(DataOutput out) throws IOException { } out.writeLong(baseIndexId); + + // write indexes + if (indexes != null) { + out.writeBoolean(true); + indexes.write(out); + } else { + out.writeBoolean(false); + } + + //dynamicProperties + if (tableProperty == null) { + out.writeBoolean(false); + } else { + out.writeBoolean(true); + tableProperty.write(out); + } } public void readFields(DataInput in) throws IOException { @@ -912,6 +998,19 @@ public void readFields(DataInput in) throws IOException { // the old table use table id as base index id baseIndexId = id; } + + // read indexes + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_70) { + if (in.readBoolean()) { + this.indexes = TableIndexes.read(in); + } + } + // dynamic partition + if (Catalog.getCurrentCatalogJournalVersion() >= FeMetaVersion.VERSION_71) { + if (in.readBoolean()) { + tableProperty = TableProperty.read(in); + } + } } public boolean equals(Table table) { diff --git a/fe/src/main/java/org/apache/doris/catalog/TableIndexes.java b/fe/src/main/java/org/apache/doris/catalog/TableIndexes.java new file mode 100644 index 00000000000000..f8aef522a8197e --- /dev/null +++ b/fe/src/main/java/org/apache/doris/catalog/TableIndexes.java @@ -0,0 +1,100 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
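As a side note on the persistence changes above: OlapTable.write()/readFields() guard each newly added optional field with a boolean presence flag plus a meta-version check. A minimal, self-contained sketch of that pattern (not Doris code; simplified to a plain string payload):

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInput;
import java.io.DataInputStream;
import java.io.DataOutput;
import java.io.DataOutputStream;
import java.io.IOException;

public class OptionalFieldSketch {
    static final int VERSION_70 = 70; // version that introduced the optional block

    static void write(DataOutput out, String payload) throws IOException {
        if (payload != null) {
            out.writeBoolean(true);   // presence flag first
            out.writeUTF(payload);    // then the optional block itself
        } else {
            out.writeBoolean(false);
        }
    }

    static String read(DataInput in, int journalVersion) throws IOException {
        // older images never wrote the flag, so only consult it for new enough versions
        if (journalVersion >= VERSION_70 && in.readBoolean()) {
            return in.readUTF();
        }
        return null;
    }

    public static void main(String[] args) throws IOException {
        ByteArrayOutputStream bos = new ByteArrayOutputStream();
        write(new DataOutputStream(bos), "{\"indexes\":[]}");
        DataInputStream in = new DataInputStream(new ByteArrayInputStream(bos.toByteArray()));
        System.out.println(read(in, 71)); // {"indexes":[]}
    }
}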
+ +package org.apache.doris.catalog; + +import org.apache.doris.common.io.Text; +import org.apache.doris.common.io.Writable; +import org.apache.doris.persist.gson.GsonUtils; + +import com.google.common.collect.Lists; +import com.google.common.collect.Maps; +import com.google.gson.annotations.SerializedName; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.HashMap; +import java.util.List; +import java.util.Map; + +/** + * Internal representation of table index, including indexes and index properties for future features + */ +public class TableIndexes implements Writable { + @SerializedName(value = "indexes") + private List indexes; + @SerializedName(value = "properties") + private Map properties; + + public TableIndexes() { + this.indexes = Lists.newArrayList(); + this.properties = Maps.newHashMap(); + } + + public TableIndexes(List indexes) { + this.indexes = indexes; + this.properties = Maps.newHashMap(); + } + + public TableIndexes(List indexes, Map properties) { + this.indexes = indexes; + this.properties = properties; + } + + public List getIndexes() { + return indexes; + } + + public List getCopiedIndexes() { + if (indexes == null || indexes.size() == 0) { + return Lists.newArrayList(); + } else { + return Lists.newArrayList(indexes); + } + } + + public void setIndexes(List indexes) { + this.indexes = indexes; + } + + public Map getProperties() { + return properties; + } + + public Map getCopiedProperties() { + if (properties == null || properties.size() == 0) { + return new HashMap<>(); + } else { + return new HashMap<>(properties); + } + } + + public void setProperties(Map properties) { + this.properties = properties; + } + + @Override + public void write(DataOutput out) throws IOException { + Text.writeString(out, GsonUtils.GSON.toJson(this)); + } + + public static TableIndexes read(DataInput in) throws IOException { + String json = Text.readString(in); + return GsonUtils.GSON.fromJson(json, TableIndexes.class); + } +} diff --git a/fe/src/main/java/org/apache/doris/catalog/TableProperty.java b/fe/src/main/java/org/apache/doris/catalog/TableProperty.java new file mode 100644 index 00000000000000..ed9d9665551a70 --- /dev/null +++ b/fe/src/main/java/org/apache/doris/catalog/TableProperty.java @@ -0,0 +1,89 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
+ +package org.apache.doris.catalog; + +import com.google.gson.annotations.SerializedName; +import org.apache.doris.common.io.Text; +import org.apache.doris.common.io.Writable; +import org.apache.doris.persist.gson.GsonUtils; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +/** TableProperty contains additional information about an OlapTable. + * TableProperty persists these extra properties along with the table. + * Different kinds of properties are recognized by prefix, such as dynamic_partition. + * If a new kind of property is added, write a method such as buildDynamicProperty to build it. + */ +public class TableProperty implements Writable { + public static final String DYNAMIC_PARTITION_PROPERTY_PREFIX = "dynamic_partition"; + + @SerializedName(value = "properties") + private Map properties; + + private DynamicPartitionProperty dynamicPartitionProperty; + + public TableProperty(Map properties) { + this.properties = properties; + } + + public static boolean isSamePrefixProperties(Map properties, String prefix) { + for (String value : properties.keySet()) { + if (!value.startsWith(prefix)) { + return false; + } + } + return true; + } + + public TableProperty buildDynamicProperty() { + HashMap dynamicPartitionProperties = new HashMap<>(); + for (Map.Entry entry : properties.entrySet()) { + if (entry.getKey().startsWith(DYNAMIC_PARTITION_PROPERTY_PREFIX)) { + dynamicPartitionProperties.put(entry.getKey(), entry.getValue()); + } + } + dynamicPartitionProperty = new DynamicPartitionProperty(dynamicPartitionProperties); + return this; + } + + void modifyTableProperties(Map modifyProperties) { + properties.putAll(modifyProperties); + buildDynamicProperty(); + } + + public Map getProperties() { + return properties; + } + + public DynamicPartitionProperty getDynamicPartitionProperty() { + return dynamicPartitionProperty; + } + + @Override + public void write(DataOutput out) throws IOException { + Text.writeString(out, GsonUtils.GSON.toJson(this)); + } + + public static TableProperty read(DataInput in) throws IOException { + return GsonUtils.GSON.fromJson(Text.readString(in), TableProperty.class).buildDynamicProperty(); + } +} diff --git a/fe/src/main/java/org/apache/doris/clone/DynamicPartitionScheduler.java b/fe/src/main/java/org/apache/doris/clone/DynamicPartitionScheduler.java new file mode 100644 index 00000000000000..19b87d39f95188 --- /dev/null +++ b/fe/src/main/java/org/apache/doris/clone/DynamicPartitionScheduler.java @@ -0,0 +1,292 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License.
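The prefix convention used by TableProperty above can be illustrated with a short, self-contained sketch: every dynamic-partition key starts with "dynamic_partition", so buildDynamicProperty() can split that group off from the general table properties with a plain startsWith check (the property values below are examples only):

import java.util.HashMap;
import java.util.Map;

public class DynamicPartitionPrefixSketch {
    public static void main(String[] args) {
        Map<String, String> properties = new HashMap<>();
        properties.put("dynamic_partition.enable", "true");
        properties.put("dynamic_partition.time_unit", "DAY");
        properties.put("dynamic_partition.end", "3");
        properties.put("dynamic_partition.prefix", "p");
        properties.put("dynamic_partition.buckets", "32");
        properties.put("storage_format", "v2"); // unrelated property stays outside the group

        Map<String, String> dynamicPartitionProperties = new HashMap<>();
        for (Map.Entry<String, String> entry : properties.entrySet()) {
            if (entry.getKey().startsWith("dynamic_partition")) {
                dynamicPartitionProperties.put(entry.getKey(), entry.getValue());
            }
        }
        System.out.println(dynamicPartitionProperties);
    }
}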
+ +package org.apache.doris.clone; + +import com.google.common.collect.Maps; +import com.google.common.collect.Range; +import com.google.common.collect.Sets; +import org.apache.doris.analysis.AddPartitionClause; +import org.apache.doris.analysis.DistributionDesc; +import org.apache.doris.analysis.HashDistributionDesc; +import org.apache.doris.analysis.PartitionKeyDesc; +import org.apache.doris.analysis.PartitionValue; +import org.apache.doris.analysis.SingleRangePartitionDesc; +import org.apache.doris.catalog.Catalog; +import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.Database; +import org.apache.doris.catalog.DynamicPartitionProperty; +import org.apache.doris.catalog.HashDistributionInfo; +import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.PartitionInfo; +import org.apache.doris.catalog.PartitionKey; +import org.apache.doris.catalog.RangePartitionInfo; +import org.apache.doris.catalog.Table; +import org.apache.doris.catalog.TableProperty; +import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.Config; +import org.apache.doris.common.DdlException; +import org.apache.doris.common.Pair; +import org.apache.doris.common.util.DynamicPartitionUtil; +import org.apache.doris.common.util.MasterDaemon; +import org.apache.doris.common.util.TimeUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; + +import java.util.ArrayList; +import java.util.Calendar; +import java.util.Collections; +import java.util.HashMap; +import java.util.Iterator; +import java.util.List; +import java.util.Map; +import java.util.Set; + +/** + * This class is used to periodically add or drop partitions on an OlapTable which specifies dynamic partition properties. + * Config.dynamic_partition_enable determines whether this feature is enabled, and Config.dynamic_partition_check_interval_seconds + * determines how often the task is performed. + */ +public class DynamicPartitionScheduler extends MasterDaemon { + private static final Logger LOG = LogManager.getLogger(DynamicPartitionScheduler.class); + public static final String LAST_SCHEDULER_TIME = "lastSchedulerTime"; + public static final String LAST_UPDATE_TIME = "lastUpdateTime"; + public static final String DYNAMIC_PARTITION_STATE = "dynamicPartitionState"; + public static final String MSG = "msg"; + + private final String DEFAULT_RUNTIME_VALUE = "N/A"; + + private Map> runtimeInfos = Maps.newConcurrentMap(); + private Set> dynamicPartitionTableInfo = Sets.newConcurrentHashSet(); + private boolean initialize; + + public enum State { + NORMAL, + ERROR + } + + + public DynamicPartitionScheduler(String name, long intervalMs) { + super(name, intervalMs); + this.initialize = false; + } + + public void registerDynamicPartitionTable(Long dbId, Long tableId) { + dynamicPartitionTableInfo.add(new Pair<>(dbId, tableId)); + } + + public void removeDynamicPartitionTable(Long dbId, Long tableId) { + dynamicPartitionTableInfo.remove(new Pair<>(dbId, tableId)); + } + + public String getRuntimeInfo(String tableName, String key) { + Map tableRuntimeInfo = runtimeInfos.getOrDefault(tableName, createDefaultRuntimeInfo()); + return tableRuntimeInfo.getOrDefault(key, DEFAULT_RUNTIME_VALUE); + } + + public void removeRuntimeInfo(String tableName) { + runtimeInfos.remove(tableName); + } + + public void createOrUpdateRuntimeInfo(String tableName, String key, String value) { + Map runtimeInfo = runtimeInfos.get(tableName); + if (runtimeInfo == null) { + runtimeInfo = createDefaultRuntimeInfo(); + 
runtimeInfo.put(key, value); + runtimeInfos.put(tableName, runtimeInfo); + } else { + runtimeInfo.put(key, value); + } + } + + private Map createDefaultRuntimeInfo() { + Map defaultRuntimeInfo = Maps.newConcurrentMap(); + defaultRuntimeInfo.put(LAST_UPDATE_TIME, DEFAULT_RUNTIME_VALUE); + defaultRuntimeInfo.put(LAST_SCHEDULER_TIME, DEFAULT_RUNTIME_VALUE); + defaultRuntimeInfo.put(DYNAMIC_PARTITION_STATE, State.NORMAL.toString()); + defaultRuntimeInfo.put(MSG, DEFAULT_RUNTIME_VALUE); + return defaultRuntimeInfo; + } + + private void dynamicAddPartition() { + Iterator> iterator = dynamicPartitionTableInfo.iterator(); + while (iterator.hasNext()) { + Pair tableInfo = iterator.next(); + Long dbId = tableInfo.first; + Long tableId = tableInfo.second; + Database db = Catalog.getInstance().getDb(dbId); + if (db == null) { + iterator.remove(); + continue; + } + String tableName; + ArrayList addPartitionClauses = new ArrayList<>(); + db.readLock(); + try { + // Only OlapTable has DynamicPartitionProperty + OlapTable olapTable = (OlapTable) db.getTable(tableId); + if (olapTable == null + || !olapTable.dynamicPartitionExists() + || !olapTable.getTableProperty().getDynamicPartitionProperty().getEnable()) { + iterator.remove(); + continue; + } + + if (olapTable.getState() != OlapTable.OlapTableState.NORMAL) { + String errorMsg = "Table[" + olapTable.getName() + "]'s state is not NORMAL." + + "Do not allow doing dynamic add partition. table state=" + olapTable.getState(); + recordFailedMsg(olapTable.getName(), errorMsg); + LOG.info(errorMsg); + continue; + } + + // Determine the partition column type + // if column type is Date, format partition name as yyyyMMdd + // if column type is DateTime, format partition name as yyyyMMddHHssmm + // scheduler time should be record even no partition added + createOrUpdateRuntimeInfo(olapTable.getName(), LAST_SCHEDULER_TIME, TimeUtils.getCurrentFormatTime()); + RangePartitionInfo rangePartitionInfo = (RangePartitionInfo) olapTable.getPartitionInfo(); + Column partitionColumn = rangePartitionInfo.getPartitionColumns().get(0); + String partitionFormat; + try { + partitionFormat = DynamicPartitionUtil.getPartitionFormat(partitionColumn); + } catch (DdlException e) { + recordFailedMsg(olapTable.getName(), e.getMessage()); + continue; + } + + Calendar calendar = Calendar.getInstance(); + TableProperty tableProperty = olapTable.getTableProperty(); + DynamicPartitionProperty dynamicPartitionProperty = tableProperty.getDynamicPartitionProperty(); + + for (int i = 0; i <= dynamicPartitionProperty.getEnd(); i++) { + String dynamicPartitionPrefix = dynamicPartitionProperty.getPrefix(); + String prevBorder = DynamicPartitionUtil.getPartitionRange(dynamicPartitionProperty.getTimeUnit(), + i, (Calendar) calendar.clone(), partitionFormat); + String partitionName = dynamicPartitionPrefix + DynamicPartitionUtil.getFormattedPartitionName(prevBorder); + + // continue if partition already exists + String nextBorder = DynamicPartitionUtil.getPartitionRange(dynamicPartitionProperty.getTimeUnit(), + i + 1, (Calendar) calendar.clone(), partitionFormat); + PartitionValue lowerValue = new PartitionValue(prevBorder); + PartitionValue upperValue = new PartitionValue(nextBorder); + PartitionInfo partitionInfo = olapTable.getPartitionInfo(); + RangePartitionInfo info = (RangePartitionInfo) (partitionInfo); + boolean isPartitionExists = false; + Range addPartitionKeyRange = null; + try { + PartitionKey lowerBound = PartitionKey.createPartitionKey(Collections.singletonList(lowerValue), 
Collections.singletonList(partitionColumn)); + PartitionKey upperBound = PartitionKey.createPartitionKey(Collections.singletonList(upperValue), Collections.singletonList(partitionColumn)); + addPartitionKeyRange = Range.closedOpen(lowerBound, upperBound); + } catch (AnalysisException e) { + // keys.size is always equal to column.size, cannot reach this exception + LOG.error("Keys size is not equal to column size."); + continue; + } + for (Range partitionKeyRange : info.getIdToRange().values()) { + // only support single column partition now + try { + RangePartitionInfo.checkRangeIntersect(partitionKeyRange, addPartitionKeyRange); + } catch (DdlException e) { + isPartitionExists = true; + if (addPartitionKeyRange.equals(partitionKeyRange)) { + clearFailedMsg(olapTable.getName()); + } else { + recordFailedMsg(olapTable.getName(), e.getMessage()); + } + break; + } + } + if (isPartitionExists) { + continue; + } + + // construct partition desc + PartitionKeyDesc partitionKeyDesc = new PartitionKeyDesc(Collections.singletonList(lowerValue), Collections.singletonList(upperValue)); + HashMap partitionProperties = new HashMap<>(1); + partitionProperties.put("replication_num", String.valueOf(DynamicPartitionUtil.estimateReplicateNum(olapTable))); + SingleRangePartitionDesc rangePartitionDesc = new SingleRangePartitionDesc(true, partitionName, + partitionKeyDesc, partitionProperties); + + // construct distribution desc + HashDistributionInfo hashDistributionInfo = (HashDistributionInfo) olapTable.getDefaultDistributionInfo(); + List distColumnNames = new ArrayList<>(); + for (Column distributionColumn : hashDistributionInfo.getDistributionColumns()) { + distColumnNames.add(distributionColumn.getName()); + } + DistributionDesc distributionDesc = new HashDistributionDesc(dynamicPartitionProperty.getBuckets(), distColumnNames); + + // add partition according to partition desc and distribution desc + addPartitionClauses.add(new AddPartitionClause(rangePartitionDesc, distributionDesc, null)); + } + tableName = olapTable.getName(); + } finally { + db.readUnlock(); + } + for (AddPartitionClause addPartitionClause : addPartitionClauses) { + try { + Catalog.getCurrentCatalog().addPartition(db, tableName, addPartitionClause); + clearFailedMsg(tableName); + } catch (DdlException e) { + recordFailedMsg(tableName, e.getMessage()); + } + } + } + } + + private void recordFailedMsg(String tableName, String msg) { + LOG.warn("dynamic add partition failed: " + msg); + createOrUpdateRuntimeInfo(tableName, DYNAMIC_PARTITION_STATE, State.ERROR.toString()); + createOrUpdateRuntimeInfo(tableName, MSG, msg); + } + + private void clearFailedMsg(String tableName) { + createOrUpdateRuntimeInfo(tableName, DYNAMIC_PARTITION_STATE, State.NORMAL.toString()); + createOrUpdateRuntimeInfo(tableName, MSG, DEFAULT_RUNTIME_VALUE); + } + + private void initDynamicPartitionTable() { + for (Long dbId : Catalog.getInstance().getDbIds()) { + Database db = Catalog.getInstance().getDb(dbId); + if (db == null) { + continue; + } + db.readLock(); + try { + for (Table table : Catalog.getInstance().getDb(dbId).getTables()) { + if (DynamicPartitionUtil.isDynamicPartitionTable(table)) { + registerDynamicPartitionTable(db.getId(), table.getId()); + } + } + } finally { + db.readUnlock(); + } + } + initialize = true; + } + + @Override + protected void runAfterCatalogReady() { + if (!initialize) { + // check Dynamic Partition tables only when FE starts + initDynamicPartitionTable(); + } + if (Config.dynamic_partition_enable) { + dynamicAddPartition(); + } 
+ } +} \ No newline at end of file diff --git a/fe/src/main/java/org/apache/doris/common/Config.java b/fe/src/main/java/org/apache/doris/common/Config.java index 753c5de3ba7ac8..44fa9eb555eefb 100644 --- a/fe/src/main/java/org/apache/doris/common/Config.java +++ b/fe/src/main/java/org/apache/doris/common/Config.java @@ -266,6 +266,15 @@ public class Config extends ConfigBase { */ @ConfField public static int query_port = 9030; + /* + * mysql service nio option. + */ + @ConfField public static boolean mysql_service_nio_enabled = false; + + /* + * num of thread to handle io events in mysql. + */ + @ConfField public static int mysql_service_io_threads_num = 4; /* * Cluster name will be shown as the title of web page */ @@ -945,5 +954,17 @@ public class Config extends ConfigBase { */ @ConfField(mutable = true) public static boolean disable_cluster_feature = true; + + /* + * Decide how often to check dynamic partition + */ + @ConfField(mutable = true, masterOnly = true) + public static int dynamic_partition_check_interval_seconds = 600; + + /* + * If set to true, dynamic partition feature will open + */ + @ConfField(mutable = true, masterOnly = true) + public static boolean dynamic_partition_enable = false; } diff --git a/fe/src/main/java/org/apache/doris/common/ErrorCode.java b/fe/src/main/java/org/apache/doris/common/ErrorCode.java index 9b2ab9f9f42e44..21a72048b89144 100644 --- a/fe/src/main/java/org/apache/doris/common/ErrorCode.java +++ b/fe/src/main/java/org/apache/doris/common/ErrorCode.java @@ -214,7 +214,25 @@ public enum ErrorCode { "Colocate tables distribution columns must have the same data type: %s should be %s"), ERR_COLOCATE_NOT_COLOCATE_TABLE(5064, new byte[] { '4', '2', '0', '0', '0' }, "Table %s is not a colocated table"), - ERR_INVALID_OPERATION(5065, new byte[] { '4', '2', '0', '0', '0' }, "Operation %s is invalid"); + ERR_INVALID_OPERATION(5065, new byte[] { '4', '2', '0', '0', '0' }, "Operation %s is invalid"), + ERROR_DYNAMIC_PARTITION_TIME_UNIT(5065, new byte[] {'4', '2', '0', '0', '0'}, + "Unsupported time unit %s. Expect DAY WEEK MONTH."), + ERROR_DYNAMIC_PARTITION_END_ZERO(5066, new byte[] {'4', '2', '0', '0', '0'}, + "Dynamic partition end must greater than 0"), + ERROR_DYNAMIC_PARTITION_END_FORMAT(5066, new byte[] {'4', '2', '0', '0', '0'}, + "Invalid dynamic partition end %s"), + ERROR_DYNAMIC_PARTITION_END_EMPTY(5066, new byte[] {'4', '2', '0', '0', '0'}, + "Dynamic partition end is empty"), + ERROR_DYNAMIC_PARTITION_BUCKETS_ZERO(5067, new byte[] {'4', '2', '0', '0', '0'}, + "Dynamic partition buckets must greater than 0"), + ERROR_DYNAMIC_PARTITION_BUCKETS_FORMAT(5067, new byte[] {'4', '2', '0', '0', '0'}, + "Invalid dynamic partition buckets %s"), + ERROR_DYNAMIC_PARTITION_BUCKETS_EMPTY(5066, new byte[] {'4', '2', '0', '0', '0'}, + "Dynamic partition buckets is empty"), + ERROR_DYNAMIC_PARTITION_ENABLE(5068, new byte[] {'4', '2', '0', '0', '0'}, + "Invalid dynamic partition enable: %s. 
Expected true or false"), + ERROR_DYNAMIC_PARTITION_PREFIX(5069, new byte[] {'4', '2', '0', '0', '0'}, + "Invalid dynamic partition prefix: %s."); ErrorCode(int code, byte[] sqlState, String errorMsg) { this.code = code; diff --git a/fe/src/main/java/org/apache/doris/common/FeMetaVersion.java b/fe/src/main/java/org/apache/doris/common/FeMetaVersion.java index 672c058575d605..2a9539ccd6d1a0 100644 --- a/fe/src/main/java/org/apache/doris/common/FeMetaVersion.java +++ b/fe/src/main/java/org/apache/doris/common/FeMetaVersion.java @@ -147,8 +147,12 @@ public final class FeMetaVersion { public static final int VERSION_67 = 67; // for es table context public static final int VERSION_68 = 68; - // modofy password checking logic + // modify password checking logic public static final int VERSION_69 = 69; + // for indexes + public static final int VERSION_70 = 70; + // dynamic partition + public static final int VERSION_71 = 71; // note: when increment meta version, should assign the latest version to VERSION_CURRENT - public static final int VERSION_CURRENT = VERSION_69; + public static final int VERSION_CURRENT = VERSION_71; } diff --git a/fe/src/main/java/org/apache/doris/common/proc/PartitionsProcDir.java b/fe/src/main/java/org/apache/doris/common/proc/PartitionsProcDir.java index 030e803abcd0d6..8b4faa22417fc9 100644 --- a/fe/src/main/java/org/apache/doris/common/proc/PartitionsProcDir.java +++ b/fe/src/main/java/org/apache/doris/common/proc/PartitionsProcDir.java @@ -20,8 +20,16 @@ import com.google.common.base.Joiner; import com.google.common.base.Preconditions; import com.google.common.collect.ImmutableList; -import com.google.common.collect.Range; +import com.google.common.collect.Lists; + +import com.google.common.collect.Range; +import org.apache.doris.analysis.BinaryPredicate; +import org.apache.doris.analysis.DateLiteral; +import org.apache.doris.analysis.Expr; +import org.apache.doris.analysis.IntLiteral; +import org.apache.doris.analysis.LimitElement; +import org.apache.doris.analysis.StringLiteral; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.DataProperty; import org.apache.doris.catalog.Database; @@ -34,15 +42,24 @@ import org.apache.doris.catalog.PartitionType; import org.apache.doris.catalog.RangePartitionInfo; import org.apache.doris.catalog.Table.TableType; +import org.apache.doris.catalog.Type; import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.ErrorCode; +import org.apache.doris.common.ErrorReport; import org.apache.doris.common.Pair; import org.apache.doris.common.util.DebugUtil; +import org.apache.doris.common.util.ListComparator; +import org.apache.doris.common.util.OrderByPair; import org.apache.doris.common.util.TimeUtils; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; import java.util.ArrayList; +import java.util.Collections; import java.util.List; import java.util.Map; + /* * SHOW PROC /dbs/dbId/tableId/partitions * show partitions' detail info within a table @@ -55,6 +72,8 @@ public class PartitionsProcDir implements ProcDirInterface { .add("LastConsistencyCheckTime").add("DataSize") .build(); + private static final Logger LOG = LogManager.getLogger(PartitionsProcDir.class); + public static final int PARTITION_NAME_INDEX = 1; private Database db; @@ -65,8 +84,124 @@ public PartitionsProcDir(Database db, OlapTable olapTable) { this.olapTable = olapTable; } - @Override - public ProcResult fetchResult() throws AnalysisException { + public boolean filter(String columnName, 
Comparable element, Map<String, Expr> filterMap) throws AnalysisException { + if (filterMap == null) { + return true; + } + Expr subExpr = filterMap.get(columnName.toLowerCase()); + if (subExpr == null) { + return true; + } + if (subExpr instanceof BinaryPredicate) { + BinaryPredicate binaryPredicate = (BinaryPredicate) subExpr; + if (subExpr.getChild(1) instanceof StringLiteral && binaryPredicate.getOp() == BinaryPredicate.Operator.EQ) { + return ((StringLiteral) subExpr.getChild(1)).getValue().equals(element); + } + long leftVal; + long rightVal; + if (subExpr.getChild(1) instanceof DateLiteral) { + leftVal = (new DateLiteral((String) element, Type.DATETIME)).getLongValue(); + rightVal = ((DateLiteral) subExpr.getChild(1)).getLongValue(); + } else { + leftVal = Long.parseLong(element.toString()); + rightVal = ((IntLiteral) subExpr.getChild(1)).getLongValue(); + } + switch (binaryPredicate.getOp()) { + case EQ: + case EQ_FOR_NULL: + return leftVal == rightVal; + case GE: + return leftVal >= rightVal; + case GT: + return leftVal > rightVal; + case LE: + return leftVal <= rightVal; + case LT: + return leftVal < rightVal; + case NE: + return leftVal != rightVal; + default: + Preconditions.checkState(false, "No defined binary operator."); + } + } else { + return like((String) element, ((StringLiteral) subExpr.getChild(1)).getValue()); + } + return true; + } + + public boolean like(String str, String expr) { + expr = expr.toLowerCase(); + expr = expr.replace(".", "\\."); + expr = expr.replace("?", "."); + expr = expr.replace("%", ".*"); + str = str.toLowerCase(); + return str.matches(expr); + } + + public ProcResult fetchResultByFilter(Map<String, Expr> filterMap, List<OrderByPair> orderByPairs, LimitElement limitElement) throws AnalysisException { + List<List<Comparable>> partitionInfos = getPartitionInfos(); + List<List<Comparable>> filterPartitionInfos = null; + // where + if (filterMap == null || filterMap.isEmpty()) { + filterPartitionInfos = partitionInfos; + } else { + filterPartitionInfos = Lists.newArrayList(); + for (List<Comparable> partitionInfo : partitionInfos) { + if (partitionInfo.size() != TITLE_NAMES.size()) { + throw new AnalysisException("PartitionInfos.size() " + partitionInfos.size() + + " not equal to TITLE_NAMES.size() " + TITLE_NAMES.size()); + } + boolean isNeed = true; + for (int i = 0; i < partitionInfo.size(); i++) { + isNeed = filter(TITLE_NAMES.get(i), partitionInfo.get(i), filterMap); + if (!isNeed) { + break; + } + } + + if (isNeed) { + filterPartitionInfos.add(partitionInfo); + } + } + } + + // order by + if (orderByPairs != null) { + ListComparator<List<Comparable>> comparator = null; + OrderByPair[] orderByPairArr = new OrderByPair[orderByPairs.size()]; + comparator = new ListComparator<>(orderByPairs.toArray(orderByPairArr)); + Collections.sort(filterPartitionInfos, comparator); + } + + // limit + if (limitElement != null && limitElement.hasLimit()) { + int beginIndex = (int) limitElement.getOffset(); + int endIndex = (int) (beginIndex + limitElement.getLimit()); + if (endIndex > filterPartitionInfos.size()) { + endIndex = filterPartitionInfos.size(); + } + filterPartitionInfos = filterPartitionInfos.subList(beginIndex, endIndex); + } + + return getBasicProcResult(filterPartitionInfos); + } + + public BaseProcResult getBasicProcResult(List<List<Comparable>> partitionInfos) { + // set result + BaseProcResult result = new BaseProcResult(); + result.setNames(TITLE_NAMES); + for (List<Comparable> info : partitionInfos) { + List<String> row = new ArrayList<String>(info.size()); + for (Comparable comparable : info) { + row.add(comparable.toString()); + } + result.addRow(row); + } + + return result; + } + + public 
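The like() helper above rewrites SQL wildcards into a regular expression, and fetchResultByFilter() then applies the WHERE, ORDER BY and LIMIT clauses in memory over the already-materialized partition rows. A self-contained sketch of the same idea (class and parameter names are illustrative, not the patch's API):

import java.util.ArrayList;
import java.util.Comparator;
import java.util.List;

public class RowFilterSketch {
    /** SQL LIKE ('%' = any run, '?' = one char, as in the patch) converted to a case-insensitive regex match. */
    static boolean like(String value, String pattern) {
        String regex = pattern.toLowerCase()
                .replace(".", "\\.")
                .replace("?", ".")
                .replace("%", ".*");
        return value.toLowerCase().matches(regex);
    }

    /** Filter, sort by one column, then apply offset/limit: the in-memory WHERE / ORDER BY / LIMIT pipeline. */
    static List<List<String>> select(List<List<String>> rows, int filterCol, String pattern,
                                     int orderCol, long offset, long limit) {
        List<List<String>> kept = new ArrayList<>();
        for (List<String> row : rows) {
            if (like(row.get(filterCol), pattern)) {
                kept.add(row);
            }
        }
        kept.sort(Comparator.comparing((List<String> r) -> r.get(orderCol)));
        int begin = (int) Math.min(offset, kept.size());
        int end = (int) Math.min(begin + limit, kept.size());
        return kept.subList(begin, end);
    }

    public static void main(String[] args) {
        List<List<String>> rows = List.of(
                List.of("p20200101", "3"),
                List.of("p20200102", "3"),
                List.of("other", "1"));
        // keeps the two p2020* rows, sorts by name, then returns only the first one
        System.out.println(select(rows, 0, "p2020%", 0, 0, 1));
    }
}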
List> getPartitionInfos() { Preconditions.checkNotNull(db); Preconditions.checkNotNull(olapTable); Preconditions.checkState(olapTable.getType() == TableType.OLAP); @@ -119,10 +254,10 @@ public ProcResult fetchResult() throws AnalysisException { } partitionInfo.add(distributionInfo.getBucketNum()); - + short replicationNum = olapTable.getPartitionInfo().getReplicationNum(partitionId); partitionInfo.add(String.valueOf(replicationNum)); - + DataProperty dataProperty = rangePartitionInfo.getDataProperty(partitionId); partitionInfo.add(dataProperty.getStorageMedium().name()); partitionInfo.add(TimeUtils.longToTimeString(dataProperty.getCooldownTimeMs())); @@ -132,7 +267,7 @@ public ProcResult fetchResult() throws AnalysisException { long dataSize = partition.getDataSize(); Pair sizePair = DebugUtil.getByteUint(dataSize); String readableSize = DebugUtil.DECIMAL_FORMAT_SCALE_3.format(sizePair.first) + " " - + sizePair.second; + + sizePair.second; partitionInfo.add(readableSize); partitionInfos.add(partitionInfo); @@ -183,7 +318,7 @@ public ProcResult fetchResult() throws AnalysisException { long dataSize = partition.getDataSize(); Pair sizePair = DebugUtil.getByteUint(dataSize); String readableSize = DebugUtil.DECIMAL_FORMAT_SCALE_3.format(sizePair.first) + " " - + sizePair.second; + + sizePair.second; partitionInfo.add(readableSize); partitionInfos.add(partitionInfo); @@ -192,19 +327,13 @@ public ProcResult fetchResult() throws AnalysisException { } finally { db.readUnlock(); } + return partitionInfos; + } - // set result - BaseProcResult result = new BaseProcResult(); - result.setNames(TITLE_NAMES); - for (List info : partitionInfos) { - List row = new ArrayList(info.size()); - for (Comparable comparable : info) { - row.add(comparable.toString()); - } - result.addRow(row); - } - - return result; + @Override + public ProcResult fetchResult() throws AnalysisException { + List> partitionInfos = getPartitionInfos(); + return getBasicProcResult(partitionInfos); } @Override @@ -234,4 +363,13 @@ public ProcNodeInterface lookup(String partitionIdStr) throws AnalysisException } } + public static int analyzeColumn(String columnName) throws AnalysisException { + for (int i = 0; i < TITLE_NAMES.size(); ++i) { + if (TITLE_NAMES.get(i).equalsIgnoreCase(columnName)) { + return i; + } + } + ErrorReport.reportAnalysisException(ErrorCode.ERR_WRONG_COLUMN_NAME, columnName); + return -1; + } } diff --git a/fe/src/main/java/org/apache/doris/common/util/DynamicPartitionUtil.java b/fe/src/main/java/org/apache/doris/common/util/DynamicPartitionUtil.java new file mode 100644 index 00000000000000..83f5097a4d2e6f --- /dev/null +++ b/fe/src/main/java/org/apache/doris/common/util/DynamicPartitionUtil.java @@ -0,0 +1,267 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
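DynamicPartitionUtil, introduced in the new file that begins here, centralizes validation of the dynamic_partition.* table properties (time_unit, end, prefix, buckets, enable). A rough standalone sketch of the same checks, with a plain IllegalArgumentException standing in for Doris's DdlException and error codes:

import java.util.Arrays;
import java.util.List;
import java.util.Map;

public class DynamicPropertySketch {
    private static final List<String> TIME_UNITS = Arrays.asList("DAY", "WEEK", "MONTH");

    static void validate(Map<String, String> props) {
        String timeUnit = props.get("dynamic_partition.time_unit");
        if (timeUnit == null || !TIME_UNITS.contains(timeUnit.toUpperCase())) {
            throw new IllegalArgumentException("Unsupported time unit: " + timeUnit);
        }
        requirePositiveInt(props.get("dynamic_partition.end"), "end");
        requirePositiveInt(props.get("dynamic_partition.buckets"), "buckets");
        String enable = props.get("dynamic_partition.enable");
        if (!"true".equalsIgnoreCase(enable) && !"false".equalsIgnoreCase(enable)) {
            throw new IllegalArgumentException("Invalid enable flag: " + enable);
        }
        String prefix = props.get("dynamic_partition.prefix");
        if (prefix == null || prefix.isEmpty()) {
            throw new IllegalArgumentException("Invalid prefix: " + prefix);
        }
    }

    private static void requirePositiveInt(String value, String name) {
        try {
            if (value == null || Integer.parseInt(value) <= 0) {
                throw new IllegalArgumentException(name + " must be greater than 0, got: " + value);
            }
        } catch (NumberFormatException e) {
            throw new IllegalArgumentException("Invalid " + name + ": " + value);
        }
    }

    public static void main(String[] args) {
        validate(Map.of(
                "dynamic_partition.enable", "true",
                "dynamic_partition.time_unit", "DAY",
                "dynamic_partition.end", "3",
                "dynamic_partition.prefix", "p",
                "dynamic_partition.buckets", "32"));
        System.out.println("properties look valid");
    }
}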
+ + +package org.apache.doris.common.util; + +import com.google.common.base.Strings; +import org.apache.doris.analysis.TimestampArithmeticExpr.TimeUnit; +import org.apache.doris.catalog.Catalog; +import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.DynamicPartitionProperty; +import org.apache.doris.catalog.OlapTable; +import org.apache.doris.catalog.Partition; +import org.apache.doris.catalog.PartitionInfo; +import org.apache.doris.catalog.PartitionType; +import org.apache.doris.catalog.PrimitiveType; +import org.apache.doris.catalog.RangePartitionInfo; +import org.apache.doris.catalog.Table; +import org.apache.doris.catalog.TableProperty; +import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.DdlException; +import org.apache.doris.common.ErrorCode; +import org.apache.doris.common.ErrorReport; +import org.apache.doris.common.FeNameFormat; + +import java.text.SimpleDateFormat; +import java.util.Calendar; +import java.util.HashMap; +import java.util.Map; + +public class DynamicPartitionUtil { + private static final String TIMESTAMP_FORMAT = "yyyyMMdd"; + private static final String DATE_FORMAT = "yyyy-MM-dd"; + private static final String DATETIME_FORMAT = "yyyy-MM-dd HH:mm:ss"; + + public static void checkTimeUnit(String timeUnit) throws DdlException { + if (Strings.isNullOrEmpty(timeUnit) + || !(timeUnit.equalsIgnoreCase(TimeUnit.DAY.toString()) + || timeUnit.equalsIgnoreCase(TimeUnit.WEEK.toString()) + || timeUnit.equalsIgnoreCase(TimeUnit.MONTH.toString()))) { + ErrorReport.reportDdlException(ErrorCode.ERROR_DYNAMIC_PARTITION_TIME_UNIT, timeUnit); + } + } + + private static void checkPrefix(String prefix) throws DdlException { + try { + FeNameFormat.checkPartitionName(prefix); + } catch (AnalysisException e) { + ErrorReport.reportDdlException(ErrorCode.ERROR_DYNAMIC_PARTITION_PREFIX, prefix); + } + } + + private static void checkEnd(String end) throws DdlException { + if (Strings.isNullOrEmpty(end)) { + ErrorReport.reportDdlException(ErrorCode.ERROR_DYNAMIC_PARTITION_END_EMPTY); + } + try { + if (Integer.parseInt(end) <= 0) { + ErrorReport.reportDdlException(ErrorCode.ERROR_DYNAMIC_PARTITION_END_ZERO, end); + } + } catch (NumberFormatException e) { + ErrorReport.reportDdlException(ErrorCode.ERROR_DYNAMIC_PARTITION_END_FORMAT, end); + } + } + + private static void checkBuckets(String buckets) throws DdlException { + if (Strings.isNullOrEmpty(buckets)) { + ErrorReport.reportDdlException(ErrorCode.ERROR_DYNAMIC_PARTITION_BUCKETS_EMPTY); + } + try { + if (Integer.parseInt(buckets) <= 0) { + ErrorReport.reportDdlException(ErrorCode.ERROR_DYNAMIC_PARTITION_BUCKETS_ZERO, buckets); + } + } catch (NumberFormatException e) { + ErrorReport.reportDdlException(ErrorCode.ERROR_DYNAMIC_PARTITION_BUCKETS_FORMAT, buckets); + } + } + + private static void checkEnable(String enable) throws DdlException { + if (Strings.isNullOrEmpty(enable) + || (!Boolean.TRUE.toString().equalsIgnoreCase(enable) && !Boolean.FALSE.toString().equalsIgnoreCase(enable))) { + ErrorReport.reportDdlException(ErrorCode.ERROR_DYNAMIC_PARTITION_ENABLE, enable); + } + } + + public static boolean checkDynamicPartitionPropertiesExist(Map properties) { + if (properties == null) { + return false; + } + return properties.containsKey(DynamicPartitionProperty.TIME_UNIT) || + properties.containsKey(DynamicPartitionProperty.END) || + properties.containsKey(DynamicPartitionProperty.PREFIX) || + properties.containsKey(DynamicPartitionProperty.BUCKETS) || + 
properties.containsKey(DynamicPartitionProperty.ENABLE); + } + + public static boolean checkInputDynamicPartitionProperties(Map properties, PartitionInfo partitionInfo) throws DdlException{ + if (properties == null || properties.isEmpty()) { + return false; + } + if (partitionInfo.getType() != PartitionType.RANGE || partitionInfo.isMultiColumnPartition()) { + throw new DdlException("Dynamic partition only support single-column range partition"); + } + String timeUnit = properties.get(DynamicPartitionProperty.TIME_UNIT); + String prefix = properties.get(DynamicPartitionProperty.PREFIX); + String end = properties.get(DynamicPartitionProperty.END); + String buckets = properties.get(DynamicPartitionProperty.BUCKETS); + String enable = properties.get(DynamicPartitionProperty.ENABLE); + if (!((Strings.isNullOrEmpty(enable) && + Strings.isNullOrEmpty(timeUnit) && + Strings.isNullOrEmpty(prefix) && + Strings.isNullOrEmpty(end) && + Strings.isNullOrEmpty(buckets)))) { + if (Strings.isNullOrEmpty(enable)) { + throw new DdlException("Must assign dynamic_partition.enable properties"); + } + if (Strings.isNullOrEmpty(timeUnit)) { + throw new DdlException("Must assign dynamic_partition.time_unit properties"); + } + if (Strings.isNullOrEmpty(prefix)) { + throw new DdlException("Must assign dynamic_partition.prefix properties"); + } + if (Strings.isNullOrEmpty(end)) { + throw new DdlException("Must assign dynamic_partition.end properties"); + } + if (Strings.isNullOrEmpty(buckets)) { + throw new DdlException("Must assign dynamic_partition.buckets properties"); + } + } + return true; + } + + public static void registerOrRemoveDynamicPartitionTable(long dbId, OlapTable olapTable) { + if (olapTable.getTableProperty() != null + && olapTable.getTableProperty().getDynamicPartitionProperty() != null) { + if (olapTable.getTableProperty().getDynamicPartitionProperty().getEnable()) { + Catalog.getCurrentCatalog().getDynamicPartitionScheduler().registerDynamicPartitionTable(dbId, olapTable.getId()); + } else { + Catalog.getCurrentCatalog().getDynamicPartitionScheduler().removeDynamicPartitionTable(dbId, olapTable.getId()); + } + } + } + + public static Map analyzeDynamicPartition(Map properties) throws DdlException { + // properties should not be empty, check properties before call this function + Map analyzedProperties = new HashMap<>(); + if (properties.containsKey(DynamicPartitionProperty.TIME_UNIT)) { + String timeUnitValue = properties.get(DynamicPartitionProperty.TIME_UNIT); + checkTimeUnit(timeUnitValue); + properties.remove(DynamicPartitionProperty.TIME_UNIT); + analyzedProperties.put(DynamicPartitionProperty.TIME_UNIT, timeUnitValue); + } + if (properties.containsKey(DynamicPartitionProperty.PREFIX)) { + String prefixValue = properties.get(DynamicPartitionProperty.PREFIX); + checkPrefix(prefixValue); + properties.remove(DynamicPartitionProperty.PREFIX); + analyzedProperties.put(DynamicPartitionProperty.PREFIX, prefixValue); + } + if (properties.containsKey(DynamicPartitionProperty.END)) { + String endValue = properties.get(DynamicPartitionProperty.END); + checkEnd(endValue); + properties.remove(DynamicPartitionProperty.END); + analyzedProperties.put(DynamicPartitionProperty.END, endValue); + } + if (properties.containsKey(DynamicPartitionProperty.BUCKETS)) { + String bucketsValue = properties.get(DynamicPartitionProperty.BUCKETS); + checkBuckets(bucketsValue); + properties.remove(DynamicPartitionProperty.BUCKETS); + analyzedProperties.put(DynamicPartitionProperty.BUCKETS, bucketsValue); + } + if 
(properties.containsKey(DynamicPartitionProperty.ENABLE)) { + String enableValue = properties.get(DynamicPartitionProperty.ENABLE); + checkEnable(enableValue); + properties.remove(DynamicPartitionProperty.ENABLE); + analyzedProperties.put(DynamicPartitionProperty.ENABLE, enableValue); + } + return analyzedProperties; + } + + public static void checkAlterAllowed(OlapTable olapTable) throws DdlException { + TableProperty tableProperty = olapTable.getTableProperty(); + if (tableProperty != null && + tableProperty.getDynamicPartitionProperty().isExist() && + tableProperty.getDynamicPartitionProperty().getEnable()) { + throw new DdlException("Cannot modify partition on a Dynamic Partition Table, set `dynamic_partition.enable` to false firstly."); + } + } + + public static boolean isDynamicPartitionTable(Table table) { + if (!(table instanceof OlapTable) || + !(((OlapTable) table).getPartitionInfo().getType().equals(PartitionType.RANGE))) { + return false; + } + RangePartitionInfo rangePartitionInfo = (RangePartitionInfo) ((OlapTable) table).getPartitionInfo(); + TableProperty tableProperty = ((OlapTable) table).getTableProperty(); + if (tableProperty == null || !tableProperty.getDynamicPartitionProperty().isExist()) { + return false; + } + + return rangePartitionInfo.getPartitionColumns().size() == 1 && tableProperty.getDynamicPartitionProperty().getEnable(); + } + + /** + * properties should be checked before call this method + */ + public static void checkAndSetDynamicPartitionProperty(OlapTable olapTable, Map properties) throws DdlException { + if (DynamicPartitionUtil.checkInputDynamicPartitionProperties(properties, olapTable.getPartitionInfo())) { + Map dynamicPartitionProperties = DynamicPartitionUtil.analyzeDynamicPartition(properties); + olapTable.setTableProperty(new TableProperty(dynamicPartitionProperties).buildDynamicProperty()); + } + } + + public static String getPartitionFormat(Column column) throws DdlException { + if (column.getDataType().equals(PrimitiveType.DATE)) { + return DATE_FORMAT; + } else if (column.getDataType().equals(PrimitiveType.DATETIME)) { + return DATETIME_FORMAT; + } else if (PrimitiveType.getIntegerTypes().contains(column.getDataType())) { + // TODO: For Integer Type, only support format it as yyyyMMdd now + return TIMESTAMP_FORMAT; + } else { + throw new DdlException("Dynamic Partition Only Support DATE, DATETIME and INTEGER Type Now."); + } + } + + public static String getFormattedPartitionName(String name) { + return name.replace("-", "").replace(":", "").replace(" ", ""); + } + + public static String getPartitionRange(String timeUnit, int offset, Calendar calendar, String format) { + if (timeUnit.equalsIgnoreCase(TimeUnit.DAY.toString())) { + calendar.add(Calendar.DAY_OF_MONTH, offset); + } else if (timeUnit.equalsIgnoreCase(TimeUnit.WEEK.toString())) { + calendar.add(Calendar.WEEK_OF_MONTH, offset); + } else { + calendar.add(Calendar.MONTH, offset); + } + SimpleDateFormat dateFormat = new SimpleDateFormat(format); + return dateFormat.format(calendar.getTime()); + } + + public static int estimateReplicateNum(OlapTable table) { + int replicateNum = 3; + long maxPartitionId = 0; + for (Partition partition: table.getPartitions()) { + if (partition.getId() > maxPartitionId) { + maxPartitionId = partition.getId(); + replicateNum = table.getPartitionInfo().getReplicationNum(partition.getId()); + } + } + return replicateNum; + } +} \ No newline at end of file diff --git a/fe/src/main/java/org/apache/doris/journal/JournalEntity.java 
b/fe/src/main/java/org/apache/doris/journal/JournalEntity.java index 42824687156f1a..ddffb7332ec5da 100644 --- a/fe/src/main/java/org/apache/doris/journal/JournalEntity.java +++ b/fe/src/main/java/org/apache/doris/journal/JournalEntity.java @@ -54,6 +54,7 @@ import org.apache.doris.persist.DropInfo; import org.apache.doris.persist.DropLinkDbAndUpdateDbInfo; import org.apache.doris.persist.DropPartitionInfo; +import org.apache.doris.persist.ModifyDynamicPartitionInfo; import org.apache.doris.persist.HbPackage; import org.apache.doris.persist.ModifyPartitionInfo; import org.apache.doris.persist.OperationType; @@ -497,6 +498,11 @@ public void readFields(DataInput in) throws IOException { isRead = true; break; } + case OperationType.OP_DYNAMIC_PARTITION: { + data = ModifyDynamicPartitionInfo.read(in); + isRead = true; + break; + } default: { IOException e = new IOException(); LOG.error("UNKNOWN Operation Type {}", opCode, e); diff --git a/fe/src/main/java/org/apache/doris/master/ReportHandler.java b/fe/src/main/java/org/apache/doris/master/ReportHandler.java index b631ce941ac0cc..65d108cddfe9da 100644 --- a/fe/src/main/java/org/apache/doris/master/ReportHandler.java +++ b/fe/src/main/java/org/apache/doris/master/ReportHandler.java @@ -576,7 +576,8 @@ private static void deleteFromMeta(ListMultimap tabletDeleteFromMeta schemaHash, partition.getVisibleVersion(), partition.getVisibleVersionHash(), keysType, TStorageType.COLUMN, - TStorageMedium.HDD, columns, bfColumns, bfFpp, null); + TStorageMedium.HDD, columns, bfColumns, bfFpp, null, + olapTable.getCopiedIndexes()); createReplicaBatchTask.addTask(createReplicaTask); } else { // just set this replica as bad diff --git a/fe/src/main/java/org/apache/doris/mysql/MysqlChannel.java b/fe/src/main/java/org/apache/doris/mysql/MysqlChannel.java index bef48c111552f8..013d1759fc076b 100644 --- a/fe/src/main/java/org/apache/doris/mysql/MysqlChannel.java +++ b/fe/src/main/java/org/apache/doris/mysql/MysqlChannel.java @@ -33,25 +33,34 @@ public class MysqlChannel { // max length which one MySQL physical can hold, if one logical packet is bigger than this, // one packet will split to many packets - private static final int MAX_PHYSICAL_PACKET_LENGTH = 0xffffff - 1; + protected static final int MAX_PHYSICAL_PACKET_LENGTH = 0xffffff - 1; // MySQL packet header length - private static final int PACKET_HEADER_LEN = 4; + protected static final int PACKET_HEADER_LEN = 4; // logger for this class - private static final Logger LOG = LogManager.getLogger(MysqlChannel.class); + protected static final Logger LOG = LogManager.getLogger(MysqlChannel.class); // next sequence id to receive or send - private int sequenceId; + protected int sequenceId; // channel connected with client - private SocketChannel channel; + protected SocketChannel channel; // used to receive/send header, avoiding new this many time. 
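The OP_DYNAMIC_PARTITION case added to JournalEntity earlier in this hunk deserializes a ModifyDynamicPartitionInfo, which later in the patch is persisted as a Gson-encoded JSON string behind a length-prefixed Text write. A rough, self-contained sketch of that write/read round trip; PartitionChange and the stream helpers below are illustrative stand-ins, not Doris classes:

import com.google.gson.Gson;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.DataInputStream;
import java.io.DataOutputStream;
import java.io.IOException;
import java.util.Map;

public class JsonJournalSketch {
    /** Illustrative payload; the patch's real class carries dbId, tableId and a properties map. */
    static final class PartitionChange {
        long dbId;
        long tableId;
        Map<String, String> properties;
        PartitionChange() {}
        PartitionChange(long dbId, long tableId, Map<String, String> properties) {
            this.dbId = dbId;
            this.tableId = tableId;
            this.properties = properties;
        }
    }

    private static final Gson GSON = new Gson();

    static byte[] write(PartitionChange info) throws IOException {
        ByteArrayOutputStream bytes = new ByteArrayOutputStream();
        try (DataOutputStream out = new DataOutputStream(bytes)) {
            out.writeUTF(GSON.toJson(info));   // length-prefixed string, analogous to Text.writeString
        }
        return bytes.toByteArray();
    }

    static PartitionChange read(byte[] data) throws IOException {
        try (DataInputStream in = new DataInputStream(new ByteArrayInputStream(data))) {
            return GSON.fromJson(in.readUTF(), PartitionChange.class);
        }
    }

    public static void main(String[] args) throws IOException {
        byte[] encoded = write(new PartitionChange(10L, 42L, Map.of("dynamic_partition.enable", "true")));
        System.out.println(GSON.toJson(read(encoded)));
    }
}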
- private ByteBuffer headerByteBuffer = ByteBuffer.allocate(PACKET_HEADER_LEN); + protected ByteBuffer headerByteBuffer = ByteBuffer.allocate(PACKET_HEADER_LEN); // default packet byte buffer for most packet - private ByteBuffer defaultBuffer = ByteBuffer.allocate(16 * 1024); - private ByteBuffer sendBuffer; + protected ByteBuffer defaultBuffer = ByteBuffer.allocate(16 * 1024); + protected ByteBuffer sendBuffer; // for log and show - private String remoteHostPortString; - private String remoteIp; - private boolean isSend; + protected String remoteHostPortString; + protected String remoteIp; + protected boolean isSend; + + protected MysqlChannel() { + this.sequenceId = 0; + this.sendBuffer = ByteBuffer.allocate(2 * 1024 * 1024); + this.isSend = false; + this.remoteHostPortString = ""; + this.remoteIp = ""; + } + public MysqlChannel(SocketChannel channel) { this.sequenceId = 0; this.channel = channel; @@ -112,7 +121,7 @@ public void close() { } } - private int readAll(ByteBuffer dstBuf) throws IOException { + protected int readAll(ByteBuffer dstBuf) throws IOException { int readLen = 0; while (dstBuf.remaining() != 0) { int ret = channel.read(dstBuf); @@ -178,7 +187,7 @@ public ByteBuffer fetchOnePacket() throws IOException { return result; } - private void realNetSend(ByteBuffer buffer) throws IOException { + protected void realNetSend(ByteBuffer buffer) throws IOException { long bufLen = buffer.remaining(); long writeLen = channel.write(buffer); if (bufLen != writeLen) { diff --git a/fe/src/main/java/org/apache/doris/mysql/MysqlServer.java b/fe/src/main/java/org/apache/doris/mysql/MysqlServer.java index af6fd714fe9f68..af846f32e3f840 100644 --- a/fe/src/main/java/org/apache/doris/mysql/MysqlServer.java +++ b/fe/src/main/java/org/apache/doris/mysql/MysqlServer.java @@ -20,7 +20,6 @@ import org.apache.doris.catalog.Catalog; import org.apache.doris.qe.ConnectContext; import org.apache.doris.qe.ConnectScheduler; - import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -33,18 +32,21 @@ public class MysqlServer { private static final Logger LOG = LogManager.getLogger(MysqlServer.class); - private int port; + protected int port; + protected volatile boolean running; private ServerSocketChannel serverChannel = null; private ConnectScheduler scheduler = null; // used to accept connect request from client private Thread listener; - private volatile boolean running; public MysqlServer(int port, ConnectScheduler scheduler) { this.port = port; this.scheduler = scheduler; } + protected MysqlServer() { + } + // start MySQL protocol service // return true if success, otherwise false public boolean start() { @@ -140,5 +142,5 @@ public ConnectScheduler getScheduler() { public void setScheduler(ConnectScheduler scheduler) { this.scheduler = scheduler; } - + } diff --git a/fe/src/main/java/org/apache/doris/mysql/nio/AcceptListener.java b/fe/src/main/java/org/apache/doris/mysql/nio/AcceptListener.java new file mode 100644 index 00000000000000..97e81224309811 --- /dev/null +++ b/fe/src/main/java/org/apache/doris/mysql/nio/AcceptListener.java @@ -0,0 +1,86 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.apache.doris.mysql.nio; + +import org.apache.doris.catalog.Catalog; +import org.apache.doris.mysql.MysqlProto; +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.ConnectProcessor; +import org.apache.doris.qe.ConnectScheduler; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.xnio.ChannelListener; +import org.xnio.StreamConnection; +import org.xnio.channels.AcceptingChannel; + +import java.io.IOException; + +/** + * listener for accept mysql connections. + */ +public class AcceptListener implements ChannelListener> { + private final Logger LOG = LogManager.getLogger(this.getClass()); + private ConnectScheduler connectScheduler; + + public AcceptListener(ConnectScheduler connectScheduler) { + this.connectScheduler = connectScheduler; + } + + @Override + public void handleEvent(AcceptingChannel channel) { + try { + StreamConnection connection = channel.accept(); + if (connection == null) { + return; + } + LOG.info("Connection established. remote={}", connection.getPeerAddress()); + NConnectContext context = new NConnectContext(connection); + context.setCatalog(Catalog.getInstance()); + connectScheduler.submit(context); + + channel.getWorker().execute(() -> { + try { + // Set thread local info + context.setThreadLocalInfo(); + context.setConnectScheduler(connectScheduler); + // authenticate check failed. + if (!MysqlProto.negotiate(context)) { + return; + } + if (connectScheduler.registerConnection(context)) { + MysqlProto.sendResponsePacket(context); + connection.setCloseListener(streamConnection -> connectScheduler.unregisterConnection(context)); + } else { + context.getState().setError("Reach limit of connections"); + MysqlProto.sendResponsePacket(context); + return; + } + context.setStartTime(); + ConnectProcessor processor = new ConnectProcessor(context); + context.startAcceptQuery(processor); + } catch (Exception e) { + LOG.warn("connect processor exception because ", e); + context.cleanup(); + } finally { + ConnectContext.remove(); + } + }); + } catch (IOException e) { + LOG.warn("Connection accept failed.", e); + } + } +} diff --git a/fe/src/main/java/org/apache/doris/mysql/nio/NConnectContext.java b/fe/src/main/java/org/apache/doris/mysql/nio/NConnectContext.java new file mode 100644 index 00000000000000..fbb8707696e3ec --- /dev/null +++ b/fe/src/main/java/org/apache/doris/mysql/nio/NConnectContext.java @@ -0,0 +1,62 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.apache.doris.mysql.nio; + +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.ConnectProcessor; +import org.xnio.StreamConnection; + +import java.io.IOException; + +/** + * connect context based on nio. + */ +public class NConnectContext extends ConnectContext { + protected NMysqlChannel mysqlChannel; + + public NConnectContext(StreamConnection connection) { + super(); + mysqlChannel = new NMysqlChannel(connection); + } + + @Override + public void cleanup() { + mysqlChannel.close(); + returnRows = 0; + } + + @Override + public NMysqlChannel getMysqlChannel() { + return mysqlChannel; + } + + public void startAcceptQuery(ConnectProcessor connectProcessor) { + mysqlChannel.startAcceptQuery(this, connectProcessor); + } + + public void suspendAcceptQuery() { + mysqlChannel.suspendAcceptQuery(); + } + + public void resumeAcceptQuery() { + mysqlChannel.resumeAcceptQuery(); + } + + public void stopAcceptQuery() throws IOException { + mysqlChannel.stopAcceptQuery(); + } +} diff --git a/fe/src/main/java/org/apache/doris/mysql/nio/NMysqlChannel.java b/fe/src/main/java/org/apache/doris/mysql/nio/NMysqlChannel.java new file mode 100644 index 00000000000000..04af03dd89ddb8 --- /dev/null +++ b/fe/src/main/java/org/apache/doris/mysql/nio/NMysqlChannel.java @@ -0,0 +1,116 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. +package org.apache.doris.mysql.nio; + +import org.apache.doris.mysql.MysqlChannel; +import org.apache.doris.qe.ConnectProcessor; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.xnio.StreamConnection; +import org.xnio.channels.Channels; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.nio.ByteBuffer; + +/** + * mysql Channel based on nio. + */ +public class NMysqlChannel extends MysqlChannel { + protected final Logger LOG = LogManager.getLogger(this.getClass()); + private StreamConnection conn; + + public NMysqlChannel(StreamConnection connection) { + super(); + this.conn = connection; + if (connection.getPeerAddress() instanceof InetSocketAddress) { + InetSocketAddress address = (InetSocketAddress) connection.getPeerAddress(); + remoteHostPortString = address.getHostString() + ":" + address.getPort(); + remoteIp = address.getAddress().getHostAddress(); + } else { + // Reach here, what's it? 
+ remoteHostPortString = connection.getPeerAddress().toString(); + remoteIp = connection.getPeerAddress().toString(); + } + } + + /** + * read packet until whole dstBuf is filled, unless block. + * Todo: find a better way to avoid block read here. + * + * @param dstBuf + * @return + * @throws IOException + */ + @Override + protected int readAll(ByteBuffer dstBuf) throws IOException { + int readLen = 0; + while (dstBuf.remaining() != 0) { + int ret = Channels.readBlocking(conn.getSourceChannel(), dstBuf); + // return -1 when remote peer close the channel + if (ret == -1) { + return readLen; + } + readLen += ret; + } + return readLen; + } + + /** + * write packet until no data is remained, unless block. + * + * @param buffer + * @throws IOException + */ + @Override + protected void realNetSend(ByteBuffer buffer) throws IOException { + long bufLen = buffer.remaining(); + long writeLen = Channels.writeBlocking(conn.getSinkChannel(), buffer); + if (bufLen != writeLen) { + throw new IOException("Write mysql packet failed.[write=" + writeLen + + ", needToWrite=" + bufLen + "]"); + } + Channels.flushBlocking(conn.getSinkChannel()); + isSend = true; + } + + @Override + public void close() { + try { + conn.close(); + } catch (IOException e) { + LOG.warn("Close channel exception, ignore."); + } + } + + public void startAcceptQuery(NConnectContext nConnectContext, ConnectProcessor connectProcessor) { + conn.getSourceChannel().setReadListener(new ReadListener(nConnectContext, connectProcessor)); + conn.getSourceChannel().resumeReads(); + } + + public void suspendAcceptQuery() { + conn.getSourceChannel().suspendReads(); + } + + public void resumeAcceptQuery() { + conn.getSourceChannel().resumeReads(); + } + + public void stopAcceptQuery() throws IOException { + conn.getSourceChannel().shutdownReads(); + } +} diff --git a/fe/src/main/java/org/apache/doris/mysql/nio/NMysqlServer.java b/fe/src/main/java/org/apache/doris/mysql/nio/NMysqlServer.java new file mode 100644 index 00000000000000..5abc89eff77a11 --- /dev/null +++ b/fe/src/main/java/org/apache/doris/mysql/nio/NMysqlServer.java @@ -0,0 +1,95 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
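NMysqlChannel above swaps MysqlChannel's socket I/O for XNIO blocking reads and writes but keeps the same loop shape: read until the destination buffer is full (or the peer closes), write until the source buffer drains, then flush. A generic java.nio sketch of those two loops, assuming a blocking SocketChannel (method names are illustrative):

import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.channels.SocketChannel;

public final class BlockingIoSketch {
    /** Read until dst is full; the returned count is short only if the remote peer closed the connection. */
    static int readFully(SocketChannel channel, ByteBuffer dst) throws IOException {
        int total = 0;
        while (dst.hasRemaining()) {
            int n = channel.read(dst);
            if (n == -1) {            // remote peer closed the channel
                return total;
            }
            total += n;
        }
        return total;
    }

    /** Write until src is drained; a blocking channel may still need several write calls for a large buffer. */
    static void writeFully(SocketChannel channel, ByteBuffer src) throws IOException {
        while (src.hasRemaining()) {
            channel.write(src);
        }
    }

    private BlockingIoSketch() {}
}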
+package org.apache.doris.mysql.nio; + +import com.google.common.util.concurrent.ThreadFactoryBuilder; +import org.apache.doris.common.Config; +import org.apache.doris.mysql.MysqlServer; +import org.apache.doris.qe.ConnectScheduler; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.xnio.OptionMap; +import org.xnio.Options; +import org.xnio.StreamConnection; +import org.xnio.Xnio; +import org.xnio.XnioWorker; +import org.xnio.channels.AcceptingChannel; + +import java.io.IOException; +import java.net.InetSocketAddress; +import java.util.concurrent.ExecutorService; +import java.util.concurrent.Executors; + +/** + * mysql protocol implementation based on nio. + */ +public class NMysqlServer extends MysqlServer { + private final Logger LOG = LogManager.getLogger(this.getClass()); + + private XnioWorker xnioWorker; + + private AcceptListener acceptListener; + + private AcceptingChannel server; + + // default task service. + private ExecutorService taskService = Executors.newCachedThreadPool((new ThreadFactoryBuilder().setDaemon(false).setNameFormat("doris-mysql-nio TASK").build())); + + public NMysqlServer(int port, ConnectScheduler connectScheduler) { + this.port = port; + this.xnioWorker = Xnio.getInstance().createWorkerBuilder() + .setWorkerName("doris-mysql-nio") + .setWorkerIoThreads(Config.mysql_service_io_threads_num) + .setExternalExecutorService(taskService).build(); + // connectScheduler only used for idle check. + this.acceptListener = new AcceptListener(connectScheduler); + } + + // start MySQL protocol service + // return true if success, otherwise false + @Override + public boolean start() { + try { + server = xnioWorker.createStreamConnectionServer(new InetSocketAddress(port), + acceptListener, OptionMap.create(Options.TCP_NODELAY, true)); + server.resumeAccepts(); + running = true; + LOG.info("Open mysql server success on {}", port); + return true; + } catch (IOException e) { + LOG.warn("Open MySQL network service failed.", e); + return false; + } + } + + @Override + public void stop() { + if (running) { + running = false; + // close server channel, make accept throw exception + try { + server.close(); + } catch (IOException e) { + LOG.warn("close server channel failed.", e); + } + } + } + + public void setTaskService(ExecutorService taskService) { + this.taskService = taskService; + } +} diff --git a/fe/src/main/java/org/apache/doris/mysql/nio/ReadListener.java b/fe/src/main/java/org/apache/doris/mysql/nio/ReadListener.java new file mode 100644 index 00000000000000..724d5fbf2b5295 --- /dev/null +++ b/fe/src/main/java/org/apache/doris/mysql/nio/ReadListener.java @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
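NMysqlServer above sizes its IO thread pool from mysql_service_io_threads_num and hands accepted connections to a listener, and the ReadListener in the next file suspends read notifications before dispatching a command to a worker, re-arming them only when processing ends, so one query cannot wake several handlers. A plain java.nio.Selector sketch of that suspend/process/resume pattern; this is an analogy, not the XNIO API the patch actually uses:

import java.nio.channels.SelectionKey;
import java.util.concurrent.ExecutorService;

public class ReadDispatchSketch {
    private final ExecutorService workers;

    public ReadDispatchSketch(ExecutorService workers) {
        this.workers = workers;
    }

    /** Called on the IO thread when a connection becomes readable. */
    void onReadable(SelectionKey key, Runnable processOnce) {
        // 1. stop listening for further reads on this connection while one command is in flight
        key.interestOps(key.interestOps() & ~SelectionKey.OP_READ);
        key.selector().wakeup();

        // 2. run the (possibly slow) command handler off the IO thread
        workers.execute(() -> {
            try {
                processOnce.run();
            } finally {
                // 3. re-arm read interest so the next command wakes the IO thread again;
                //    the patch instead shuts reads down and cleans up if the session was killed
                key.interestOps(key.interestOps() | SelectionKey.OP_READ);
                key.selector().wakeup();
            }
        });
    }
}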
+package org.apache.doris.mysql.nio; + +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.qe.ConnectProcessor; +import org.apache.logging.log4j.LogManager; +import org.apache.logging.log4j.Logger; +import org.xnio.ChannelListener; +import org.xnio.XnioIoThread; +import org.xnio.conduits.ConduitStreamSourceChannel; + +/** + * listener for handle mysql cmd. + */ +public class ReadListener implements ChannelListener { + private final Logger LOG = LogManager.getLogger(this.getClass()); + private NConnectContext ctx; + private ConnectProcessor connectProcessor; + + public ReadListener(NConnectContext nConnectContext, ConnectProcessor connectProcessor) { + this.ctx = nConnectContext; + this.connectProcessor = connectProcessor; + } + + @Override + public void handleEvent(ConduitStreamSourceChannel channel) { + // suspend must be call sync in current thread (the IO-Thread notify the read event), + // otherwise multi handler(task thread) would be waked up by once query. + XnioIoThread.requireCurrentThread(); + ctx.suspendAcceptQuery(); + // start async query handle in task thread. + channel.getWorker().execute(() -> { + ctx.setThreadLocalInfo(); + try { + connectProcessor.processOnce(); + if (!ctx.isKilled()) { + ctx.resumeAcceptQuery(); + } else { + ctx.stopAcceptQuery(); + ctx.cleanup(); + } + } catch (Exception e) { + LOG.warn("Exception happened in one session(" + ctx + ").", e); + ctx.setKilled(); + ctx.cleanup(); + } finally { + ConnectContext.remove(); + } + }); + } +} diff --git a/fe/src/main/java/org/apache/doris/persist/EditLog.java b/fe/src/main/java/org/apache/doris/persist/EditLog.java index 3d6c01685280e8..2e26de11241467 100644 --- a/fe/src/main/java/org/apache/doris/persist/EditLog.java +++ b/fe/src/main/java/org/apache/doris/persist/EditLog.java @@ -697,6 +697,11 @@ public static void loadJournal(Catalog catalog, JournalEntity journal) { catalog.replayConvertDistributionType(tableInfo); break; } + case OperationType.OP_DYNAMIC_PARTITION: { + ModifyDynamicPartitionInfo modifyDynamicPartitionInfo = (ModifyDynamicPartitionInfo) journal.getData(); + catalog.replayModifyTableDynamicPartition(modifyDynamicPartitionInfo); + break; + } default: { IOException e = new IOException(); LOG.error("UNKNOWN Operation Type {}", opCode, e); @@ -1195,7 +1200,11 @@ public void logAlterJob(AlterJobV2 alterJob) { logEdit(OperationType.OP_ALTER_JOB_V2, alterJob); } - public void logModifyDitrubutionType(TableInfo tableInfo) { + public void logModifyDistributionType(TableInfo tableInfo) { logEdit(OperationType.OP_MODIFY_DISTRIBUTION_TYPE, tableInfo); } + + public void logDynamicPartition(ModifyDynamicPartitionInfo info) { + logEdit(OperationType.OP_DYNAMIC_PARTITION, info); + } } diff --git a/fe/src/main/java/org/apache/doris/persist/ModifyDynamicPartitionInfo.java b/fe/src/main/java/org/apache/doris/persist/ModifyDynamicPartitionInfo.java new file mode 100644 index 00000000000000..90051bec762bcc --- /dev/null +++ b/fe/src/main/java/org/apache/doris/persist/ModifyDynamicPartitionInfo.java @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. 
You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.persist; + +import com.google.gson.annotations.SerializedName; +import org.apache.doris.common.io.Text; +import org.apache.doris.common.io.Writable; +import org.apache.doris.persist.gson.GsonUtils; + +import java.io.DataInput; +import java.io.DataOutput; +import java.io.IOException; +import java.util.HashMap; +import java.util.Map; + +public class ModifyDynamicPartitionInfo implements Writable { + + @SerializedName(value = "dbId") + private long dbId; + @SerializedName(value = "tableId") + private long tableId; + @SerializedName(value = "properties") + private Map properties = new HashMap<>(); + + public ModifyDynamicPartitionInfo(long dbId, long tableId, Map properties) { + this.dbId = dbId; + this.tableId = tableId; + this.properties = properties; + } + + public long getDbId() { + return dbId; + } + + public long getTableId() { + return tableId; + } + + public Map getProperties() { + return properties; + } + + @Override + public void write(DataOutput out) throws IOException { + Text.writeString(out, GsonUtils.GSON.toJson(this)); + } + + public static ModifyDynamicPartitionInfo read(DataInput in) throws IOException { + return GsonUtils.GSON.fromJson(Text.readString(in), ModifyDynamicPartitionInfo.class); + } +} diff --git a/fe/src/main/java/org/apache/doris/persist/OperationType.java b/fe/src/main/java/org/apache/doris/persist/OperationType.java index f6a83527860469..0e117a0d79cc8d 100644 --- a/fe/src/main/java/org/apache/doris/persist/OperationType.java +++ b/fe/src/main/java/org/apache/doris/persist/OperationType.java @@ -153,4 +153,7 @@ public class OperationType { // small files 251~260 public static final short OP_CREATE_SMALL_FILE = 251; public static final short OP_DROP_SMALL_FILE = 252; + + // dynamic partition 261~265 + public static final short OP_DYNAMIC_PARTITION = 261; } diff --git a/fe/src/main/java/org/apache/doris/planner/SingleNodePlanner.java b/fe/src/main/java/org/apache/doris/planner/SingleNodePlanner.java index 8a4a7b4a36f639..09ed92505be4e9 100644 --- a/fe/src/main/java/org/apache/doris/planner/SingleNodePlanner.java +++ b/fe/src/main/java/org/apache/doris/planner/SingleNodePlanner.java @@ -555,7 +555,8 @@ private void turnOffPreAgg(AggregateInfo aggInfo, SelectStmt selectStmt, Analyze returnColumnValidate = false; break; } - } else if (aggExpr.getFnName().getFunction().equalsIgnoreCase(FunctionSet.BITMAP_UNION)) { + } else if (aggExpr.getFnName().getFunction().equalsIgnoreCase(FunctionSet.BITMAP_UNION) + || aggExpr.getFnName().getFunction().equalsIgnoreCase(FunctionSet.BITMAP_UNION_COUNT)) { if (col.getAggregationType() != AggregateType.BITMAP_UNION) { turnOffReason = "Aggregate Operator not match: BITMAP_UNION <--> " + col.getAggregationType(); returnColumnValidate = false; diff --git a/fe/src/main/java/org/apache/doris/qe/ConnectContext.java b/fe/src/main/java/org/apache/doris/qe/ConnectContext.java index 12e6afa0a698b7..e69c59031de06f 100644 --- a/fe/src/main/java/org/apache/doris/qe/ConnectContext.java +++ b/fe/src/main/java/org/apache/doris/qe/ConnectContext.java @@ -41,60 +41,60 @@ // Use `volatile` to make the 
reference change atomic. public class ConnectContext { private static final Logger LOG = LogManager.getLogger(ConnectContext.class); - private static ThreadLocal threadLocalInfo = new ThreadLocal(); + protected static ThreadLocal threadLocalInfo = new ThreadLocal(); // set this id before analyze - private volatile long stmtId; - private volatile long forwardedStmtId; + protected volatile long stmtId; + protected volatile long forwardedStmtId; - private volatile TUniqueId queryId; + protected volatile TUniqueId queryId; // id for this connection - private volatile int connectionId; + protected volatile int connectionId; // mysql net - private volatile MysqlChannel mysqlChannel; + protected volatile MysqlChannel mysqlChannel; // state - private volatile QueryState state; - private volatile long returnRows; + protected volatile QueryState state; + protected volatile long returnRows; // the protocol capability which server say it can support - private volatile MysqlCapability serverCapability; + protected volatile MysqlCapability serverCapability; // the protocol capability after server and client negotiate - private volatile MysqlCapability capability; + protected volatile MysqlCapability capability; // Indicate if this client is killed. - private volatile boolean isKilled; + protected volatile boolean isKilled; // Db - private volatile String currentDb = ""; + protected volatile String currentDb = ""; // cluster name - private volatile String clusterName = ""; + protected volatile String clusterName = ""; // username@host of current login user - private volatile String qualifiedUser; + protected volatile String qualifiedUser; // username@host combination for the Doris account // that the server used to authenticate the current client. // In other word, currentUserIdentity is the entry that matched in Doris auth table. // This account determines user's access privileges. - private volatile UserIdentity currentUserIdentity; + protected volatile UserIdentity currentUserIdentity; // Serializer used to pack MySQL packet. - private volatile MysqlSerializer serializer; + protected volatile MysqlSerializer serializer; // Variables belong to this session. - private volatile SessionVariable sessionVariable; + protected volatile SessionVariable sessionVariable; // Scheduler this connection belongs to - private volatile ConnectScheduler connectScheduler; + protected volatile ConnectScheduler connectScheduler; // Executor - private volatile StmtExecutor executor; + protected volatile StmtExecutor executor; // Command this connection is processing. - private volatile MysqlCommand command; + protected volatile MysqlCommand command; // Timestamp in millisecond last command starts at - private volatile long startTime; + protected volatile long startTime; // Cache thread info for this connection. 
- private volatile ThreadInfo threadInfo; + protected volatile ThreadInfo threadInfo; // Catalog: put catalog here is convenient for unit test, // because catalog is singleton, hard to mock - private Catalog catalog; - private boolean isSend; + protected Catalog catalog; + protected boolean isSend; - private AuditBuilder auditBuilder; + protected AuditBuilder auditBuilder; - private String remoteIP; + protected String remoteIP; public static ConnectContext get() { return threadLocalInfo.get(); @@ -112,6 +112,17 @@ public boolean isSend() { return this.isSend; } + public ConnectContext() { + state = new QueryState(); + returnRows = 0; + serverCapability = MysqlCapability.DEFAULT_CAPABILITY; + isKilled = false; + serializer = MysqlSerializer.newInstance(); + sessionVariable = VariableMgr.newSessionVariable(); + auditBuilder = new AuditBuilder(); + command = MysqlCommand.COM_SLEEP; + } + public ConnectContext(SocketChannel channel) { state = new QueryState(); returnRows = 0; diff --git a/fe/src/main/java/org/apache/doris/qe/ConnectScheduler.java b/fe/src/main/java/org/apache/doris/qe/ConnectScheduler.java index a7d08df933c07e..2afb973c5a1a8a 100644 --- a/fe/src/main/java/org/apache/doris/qe/ConnectScheduler.java +++ b/fe/src/main/java/org/apache/doris/qe/ConnectScheduler.java @@ -19,6 +19,7 @@ import org.apache.doris.catalog.Catalog; import org.apache.doris.mysql.MysqlProto; +import org.apache.doris.mysql.nio.NConnectContext; import org.apache.doris.mysql.privilege.PrivPredicate; import com.google.common.collect.Lists; @@ -81,6 +82,10 @@ public boolean submit(ConnectContext context) { return false; } context.setConnectionId(nextConnectionId.getAndAdd(1)); + // no necessary for nio. + if(context instanceof NConnectContext){ + return true; + } if (executor.submit(new LoopHandler(context)) == null) { LOG.warn("Submit one thread failed."); return false; diff --git a/fe/src/main/java/org/apache/doris/qe/QeService.java b/fe/src/main/java/org/apache/doris/qe/QeService.java index 8f683dc1d35f31..dd8ed879c1f6a0 100644 --- a/fe/src/main/java/org/apache/doris/qe/QeService.java +++ b/fe/src/main/java/org/apache/doris/qe/QeService.java @@ -18,7 +18,7 @@ package org.apache.doris.qe; import org.apache.doris.mysql.MysqlServer; - +import org.apache.doris.mysql.nio.NMysqlServer; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -37,7 +37,7 @@ public QeService(int port) { this.port = port; } - public QeService(int port, ConnectScheduler scheduler) { + public QeService(int port, boolean nioEnabled, ConnectScheduler scheduler) { // Set up help module try { HelpModule.getInstance().setUpModule(); @@ -45,7 +45,11 @@ public QeService(int port, ConnectScheduler scheduler) { LOG.error("Help module failed, because:", e); } this.port = port; - mysqlServer = new MysqlServer(port, scheduler); + if (nioEnabled) { + mysqlServer = new NMysqlServer(port, scheduler); + } else { + mysqlServer = new MysqlServer(port, scheduler); + } } public void start() throws IOException { diff --git a/fe/src/main/java/org/apache/doris/qe/ShowExecutor.java b/fe/src/main/java/org/apache/doris/qe/ShowExecutor.java index 5e6e4fb8befa6a..3f5f3ec10f51d1 100644 --- a/fe/src/main/java/org/apache/doris/qe/ShowExecutor.java +++ b/fe/src/main/java/org/apache/doris/qe/ShowExecutor.java @@ -35,11 +35,13 @@ import org.apache.doris.analysis.ShowDataStmt; import org.apache.doris.analysis.ShowDbStmt; import org.apache.doris.analysis.ShowDeleteStmt; +import org.apache.doris.analysis.ShowDynamicPartitionStmt; import 
org.apache.doris.analysis.ShowEnginesStmt; import org.apache.doris.analysis.ShowExportStmt; import org.apache.doris.analysis.ShowFrontendsStmt; import org.apache.doris.analysis.ShowFunctionStmt; import org.apache.doris.analysis.ShowGrantsStmt; +import org.apache.doris.analysis.ShowIndexStmt; import org.apache.doris.analysis.ShowLoadStmt; import org.apache.doris.analysis.ShowLoadWarningsStmt; import org.apache.doris.analysis.ShowMigrationsStmt; @@ -68,7 +70,9 @@ import org.apache.doris.catalog.Catalog; import org.apache.doris.catalog.Column; import org.apache.doris.catalog.Database; +import org.apache.doris.catalog.DynamicPartitionProperty; import org.apache.doris.catalog.Function; +import org.apache.doris.catalog.Index; import org.apache.doris.catalog.MaterializedIndex; import org.apache.doris.catalog.MaterializedIndex.IndexExtState; import org.apache.doris.catalog.MetadataViewer; @@ -81,6 +85,7 @@ import org.apache.doris.catalog.TabletInvertedIndex; import org.apache.doris.catalog.Type; import org.apache.doris.catalog.View; +import org.apache.doris.clone.DynamicPartitionScheduler; import org.apache.doris.cluster.BaseParam; import org.apache.doris.cluster.ClusterNamespace; import org.apache.doris.common.AnalysisException; @@ -91,13 +96,13 @@ import org.apache.doris.common.ErrorReport; import org.apache.doris.common.MetaNotFoundException; import org.apache.doris.common.PatternMatcher; -import org.apache.doris.common.proc.ProcNodeInterface; import org.apache.doris.common.proc.BackendsProcDir; +import org.apache.doris.common.proc.FrontendsProcNode; import org.apache.doris.common.proc.LoadProcDir; -import org.apache.doris.common.proc.SchemaChangeProcNode; import org.apache.doris.common.proc.PartitionsProcDir; +import org.apache.doris.common.proc.ProcNodeInterface; +import org.apache.doris.common.proc.SchemaChangeProcNode; import org.apache.doris.common.proc.TabletsProcDir; -import org.apache.doris.common.proc.FrontendsProcNode; import org.apache.doris.common.util.ListComparator; import org.apache.doris.common.util.LogBuilder; import org.apache.doris.common.util.LogKey; @@ -131,7 +136,6 @@ import java.util.ArrayList; import java.util.Collection; import java.util.Collections; -import java.util.Iterator; import java.util.List; import java.util.Map; import java.util.Set; @@ -237,6 +241,10 @@ public ShowResultSet execute() throws AnalysisException { handleAdminShowConfig(); } else if (stmt instanceof ShowSmallFilesStmt) { handleShowSmallFiles(); + } else if (stmt instanceof ShowDynamicPartitionStmt) { + handleShowDynamicPartition(); + } else if (stmt instanceof ShowIndexStmt) { + handleShowIndex(); } else { handleEmtpy(); } @@ -639,6 +647,34 @@ private void handleShowColumn() throws AnalysisException { resultSet = new ShowResultSet(showStmt.getMetaData(), rows); } + // Show index statement. 
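+ // Serves SHOW INDEX FROM db.table (see ShowIndexStmt): under the db read lock, walk the OlapTable's
+ // indexes and emit one row per index: table name, index name, column list, index type and comment.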
+ private void handleShowIndex() throws AnalysisException { + ShowIndexStmt showStmt = (ShowIndexStmt) stmt; + List> rows = Lists.newArrayList(); + Database db = ctx.getCatalog().getDb(showStmt.getDbName()); + if (db == null) { + ErrorReport.reportAnalysisException(ErrorCode.ERR_BAD_TABLE_ERROR, showStmt.getTableName().toString()); + } + db.readLock(); + try { + Table table = db.getTable(showStmt.getTableName().getTbl()); + if (table != null && table instanceof OlapTable) { + List indexes = ((OlapTable) table).getIndexes(); + for (Index index : indexes) { + rows.add(Lists.newArrayList(showStmt.getTableName().toString(), index.getIndexName(), + index.getColumns().stream().collect(Collectors.joining(",")), + index.getIndexType().name(), index.getComment())); + } + } else { + ErrorReport.reportAnalysisException(ErrorCode.ERR_BAD_TABLE_ERROR, + db.getFullName() + "." + showStmt.getTableName().toString()); + } + } finally { + db.readUnlock(); + } + resultSet = new ShowResultSet(showStmt.getMetaData(), rows); + } + // Handle help statement. private void handleHelp() { HelpStmt helpStmt = (HelpStmt) stmt; @@ -1054,26 +1090,10 @@ private void handleShowData() throws AnalysisException { private void handleShowPartitions() throws AnalysisException { ShowPartitionsStmt showStmt = (ShowPartitionsStmt) stmt; - ProcNodeInterface procNodeI = showStmt.getNode(); Preconditions.checkNotNull(procNodeI); - List> rows = procNodeI.fetchResult().getRows(); - - if (showStmt.getPartitionName() != null) { - // filter by partition name - List> oneRow = Lists.newArrayList(); - String partitionName = showStmt.getPartitionName(); - Iterator> iter = rows.iterator(); - while (iter.hasNext()) { - List row = iter.next(); - if (row.get(PartitionsProcDir.PARTITION_NAME_INDEX).equalsIgnoreCase(partitionName)) { - oneRow.add(row); - break; - } - } - rows = oneRow; - } - + List> rows = ((PartitionsProcDir) procNodeI).fetchResultByFilter(showStmt.getFilterMap(), + showStmt.getOrderByPairs(), showStmt.getLimitElement()).getRows(); resultSet = new ShowResultSet(showStmt.getMetaData(), rows); } @@ -1423,6 +1443,50 @@ private void handleShowSmallFiles() throws AnalysisException { resultSet = new ShowResultSet(showStmt.getMetaData(), results); } + private void handleShowDynamicPartition() { + ShowDynamicPartitionStmt showDynamicPartitionStmt = (ShowDynamicPartitionStmt) stmt; + List> rows = Lists.newArrayList(); + Database db = ctx.getCatalog().getDb(showDynamicPartitionStmt.getDb()); + if (db != null) { + db.readLock(); + try { + for (Table tbl : db.getTables()) { + if (!(tbl instanceof OlapTable)) { + continue; + } + + DynamicPartitionScheduler dynamicPartitionScheduler = Catalog.getCurrentCatalog().getDynamicPartitionScheduler(); + OlapTable olapTable = (OlapTable) tbl; + if (!olapTable.dynamicPartitionExists()) { + dynamicPartitionScheduler.removeRuntimeInfo(olapTable.getName()); + continue; + } + // check tbl privs + if (!Catalog.getCurrentCatalog().getAuth().checkTblPriv(ConnectContext.get(), + db.getFullName(), olapTable.getName(), + PrivPredicate.SHOW)) { + continue; + } + DynamicPartitionProperty dynamicPartitionProperty = olapTable.getTableProperty().getDynamicPartitionProperty(); + String tableName = olapTable.getName(); + rows.add(Lists.newArrayList( + tableName, + String.valueOf(dynamicPartitionProperty.getEnable()), + dynamicPartitionProperty.getTimeUnit().toUpperCase(), + String.valueOf(dynamicPartitionProperty.getEnd()), + dynamicPartitionProperty.getPrefix(), + String.valueOf(dynamicPartitionProperty.getBuckets()), + 
dynamicPartitionScheduler.getRuntimeInfo(tableName, DynamicPartitionScheduler.LAST_UPDATE_TIME), + dynamicPartitionScheduler.getRuntimeInfo(tableName, DynamicPartitionScheduler.LAST_SCHEDULER_TIME), + dynamicPartitionScheduler.getRuntimeInfo(tableName, DynamicPartitionScheduler.DYNAMIC_PARTITION_STATE), + dynamicPartitionScheduler.getRuntimeInfo(tableName, DynamicPartitionScheduler.MSG))); + } + } finally { + db.readUnlock(); + } + resultSet = new ShowResultSet(showDynamicPartitionStmt.getMetaData(), rows); + } + } } diff --git a/fe/src/main/java/org/apache/doris/task/CreateReplicaTask.java b/fe/src/main/java/org/apache/doris/task/CreateReplicaTask.java index 08dc5858702aa8..20c7777d5bdcc1 100644 --- a/fe/src/main/java/org/apache/doris/task/CreateReplicaTask.java +++ b/fe/src/main/java/org/apache/doris/task/CreateReplicaTask.java @@ -19,11 +19,13 @@ import org.apache.doris.alter.SchemaChangeHandler; import org.apache.doris.catalog.Column; +import org.apache.doris.catalog.Index; import org.apache.doris.catalog.KeysType; import org.apache.doris.common.MarkedCountDownLatch; import org.apache.doris.common.Status; import org.apache.doris.thrift.TColumn; import org.apache.doris.thrift.TCreateTabletReq; +import org.apache.doris.thrift.TOlapTableIndex; import org.apache.doris.thrift.TStatusCode; import org.apache.doris.thrift.TStorageMedium; import org.apache.doris.thrift.TStorageType; @@ -31,6 +33,7 @@ import org.apache.doris.thrift.TTaskType; import org.apache.doris.thrift.TStorageFormat; +import org.apache.commons.collections.CollectionUtils; import org.apache.logging.log4j.LogManager; import org.apache.logging.log4j.Logger; @@ -57,6 +60,9 @@ public class CreateReplicaTask extends AgentTask { private Set bfColumns; private double bfFpp; + // indexes + private List indexes; + // used for synchronous process private MarkedCountDownLatch latch; @@ -72,7 +78,8 @@ public CreateReplicaTask(long backendId, long dbId, long tableId, long partition short shortKeyColumnCount, int schemaHash, long version, long versionHash, KeysType keysType, TStorageType storageType, TStorageMedium storageMedium, List columns, - Set bfColumns, double bfFpp, MarkedCountDownLatch latch) { + Set bfColumns, double bfFpp, MarkedCountDownLatch latch, + List indexes) { super(null, backendId, TTaskType.CREATE, dbId, tableId, partitionId, indexId, tabletId); this.shortKeyColumnCount = shortKeyColumnCount; @@ -88,6 +95,7 @@ public CreateReplicaTask(long backendId, long dbId, long tableId, long partition this.columns = columns; this.bfColumns = bfColumns; + this.indexes = indexes; this.bfFpp = bfFpp; this.latch = latch; @@ -153,6 +161,15 @@ public TCreateTabletReq toThrift() { } tSchema.setColumns(tColumns); + if (CollectionUtils.isNotEmpty(indexes)) { + List tIndexes = new ArrayList<>(); + for (Index index : indexes) { + tIndexes.add(index.toThrift()); + } + tSchema.setIndexes(tIndexes); + storageFormat = TStorageFormat.V2; + } + if (bfColumns != null) { tSchema.setBloom_filter_fpp(bfFpp); } diff --git a/fe/src/main/jflex/sql_scanner.flex b/fe/src/main/jflex/sql_scanner.flex index 56a93684accef9..53322e7236ad07 100644 --- a/fe/src/main/jflex/sql_scanner.flex +++ b/fe/src/main/jflex/sql_scanner.flex @@ -160,6 +160,7 @@ import org.apache.doris.qe.SqlModeHelper; keywordMap.put("distinctpcsa", new Integer(SqlParserSymbols.KW_DISTINCTPCSA)); keywordMap.put("distributed", new Integer(SqlParserSymbols.KW_DISTRIBUTED)); keywordMap.put("distribution", new Integer(SqlParserSymbols.KW_DISTRIBUTION)); + keywordMap.put("dynamic", new 
Integer(SqlParserSymbols.KW_DYNAMIC)); keywordMap.put("buckets", new Integer(SqlParserSymbols.KW_BUCKETS)); keywordMap.put("div", new Integer(SqlParserSymbols.KW_DIV)); keywordMap.put("double", new Integer(SqlParserSymbols.KW_DOUBLE)); diff --git a/fe/src/test/java/org/apache/doris/alter/SchemaChangeJobV2Test.java b/fe/src/test/java/org/apache/doris/alter/SchemaChangeJobV2Test.java index 15ce167bb42872..6d6705e7c21d7c 100644 --- a/fe/src/test/java/org/apache/doris/alter/SchemaChangeJobV2Test.java +++ b/fe/src/test/java/org/apache/doris/alter/SchemaChangeJobV2Test.java @@ -27,11 +27,14 @@ import org.apache.doris.analysis.ColumnDef; import org.apache.doris.analysis.ColumnDef.DefaultValue; import org.apache.doris.analysis.ColumnPosition; +import org.apache.doris.analysis.ModifyTablePropertiesClause; import org.apache.doris.analysis.TypeDef; +import org.apache.doris.backup.CatalogMocker; import org.apache.doris.catalog.AggregateType; import org.apache.doris.catalog.Catalog; import org.apache.doris.catalog.CatalogTestUtil; import org.apache.doris.catalog.Database; +import org.apache.doris.catalog.DynamicPartitionProperty; import org.apache.doris.catalog.FakeCatalog; import org.apache.doris.catalog.FakeEditLog; import org.apache.doris.catalog.MaterializedIndex; @@ -46,6 +49,7 @@ import org.apache.doris.catalog.ScalarType; import org.apache.doris.catalog.Tablet; import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.DdlException; import org.apache.doris.common.FeConstants; import org.apache.doris.common.FeMetaVersion; import org.apache.doris.common.UserException; @@ -58,10 +62,13 @@ import org.junit.Assert; import org.junit.Before; +import org.junit.Rule; import org.junit.Test; +import org.junit.rules.ExpectedException; import java.lang.reflect.InvocationTargetException; import java.util.ArrayList; +import java.util.HashMap; import java.util.List; import java.util.Map; @@ -80,6 +87,9 @@ public class SchemaChangeJobV2Test { false, AggregateType.MAX, false, new DefaultValue(true, "1"), ""); private static AddColumnClause addColumnClause = new AddColumnClause(newCol, new ColumnPosition("v"), null, null); + @Rule + public ExpectedException expectedEx = ExpectedException.none(); + @Before public void setUp() throws InstantiationException, IllegalAccessException, IllegalArgumentException, InvocationTargetException, NoSuchMethodException, SecurityException, AnalysisException { @@ -161,7 +171,7 @@ public void testSchemaChange1() throws Exception { Assert.assertEquals(2, testPartition.getMaterializedIndices(IndexExtState.ALL).size()); Assert.assertEquals(1, testPartition.getMaterializedIndices(IndexExtState.VISIBLE).size()); Assert.assertEquals(1, testPartition.getMaterializedIndices(IndexExtState.SHADOW).size()); - + // runWaitingTxnJob schemaChangeHandler.runAfterCatalogReady(); Assert.assertEquals(JobState.RUNNING, schemaChangeJob.getJobState()); @@ -187,9 +197,90 @@ public void testSchemaChange1() throws Exception { shadowReplica.updateVersionInfo(testPartition.getVisibleVersion(), testPartition.getVisibleVersionHash(), shadowReplica.getDataSize(), shadowReplica.getRowCount()); } } - + schemaChangeHandler.runAfterCatalogReady(); Assert.assertEquals(JobState.FINISHED, schemaChangeJob.getJobState()); } + @Test + public void testModifyDynamicPartitionNormal() throws UserException { + FakeCatalog.setCatalog(masterCatalog); + SchemaChangeHandler schemaChangeHandler = Catalog.getInstance().getSchemaChangeHandler(); + ArrayList alterClauses = new ArrayList<>(); + Map properties = 
new HashMap<>(); + properties.put(DynamicPartitionProperty.ENABLE, "true"); + properties.put(DynamicPartitionProperty.TIME_UNIT, "day"); + properties.put(DynamicPartitionProperty.END, "3"); + properties.put(DynamicPartitionProperty.PREFIX, "p"); + properties.put(DynamicPartitionProperty.BUCKETS, "30"); + alterClauses.add(new ModifyTablePropertiesClause(properties)); + Database db = CatalogMocker.mockDb(); + OlapTable olapTable = (OlapTable) db.getTable(CatalogMocker.TEST_TBL2_ID); + schemaChangeHandler.process(alterClauses, "default_cluster", db, olapTable); + Assert.assertTrue(olapTable.getTableProperty().getDynamicPartitionProperty().isExist()); + Assert.assertTrue(olapTable.getTableProperty().getDynamicPartitionProperty().getEnable()); + Assert.assertEquals("day", olapTable.getTableProperty().getDynamicPartitionProperty().getTimeUnit()); + Assert.assertEquals(3, olapTable.getTableProperty().getDynamicPartitionProperty().getEnd()); + Assert.assertEquals("p", olapTable.getTableProperty().getDynamicPartitionProperty().getPrefix()); + Assert.assertEquals(30, olapTable.getTableProperty().getDynamicPartitionProperty().getBuckets()); + + + // set dynamic_partition.enable = false + ArrayList tmpAlterClauses = new ArrayList<>(); + properties.put(DynamicPartitionProperty.ENABLE, "false"); + tmpAlterClauses.add(new ModifyTablePropertiesClause(properties)); + schemaChangeHandler.process(tmpAlterClauses, "default_cluster", db, olapTable); + Assert.assertFalse(olapTable.getTableProperty().getDynamicPartitionProperty().getEnable()); + // set dynamic_partition.time_unit = week + tmpAlterClauses = new ArrayList<>(); + properties.put(DynamicPartitionProperty.TIME_UNIT, "week"); + tmpAlterClauses.add(new ModifyTablePropertiesClause(properties)); + schemaChangeHandler.process(tmpAlterClauses, "default_cluster", db, olapTable); + Assert.assertEquals("week", olapTable.getTableProperty().getDynamicPartitionProperty().getTimeUnit()); + // set dynamic_partition.end = 10 + tmpAlterClauses = new ArrayList<>(); + properties.put(DynamicPartitionProperty.END, "10"); + tmpAlterClauses.add(new ModifyTablePropertiesClause(properties)); + schemaChangeHandler.process(tmpAlterClauses, "default_cluster", db, olapTable); + Assert.assertEquals(10, olapTable.getTableProperty().getDynamicPartitionProperty().getEnd()); + // set dynamic_partition.prefix = p1 + tmpAlterClauses = new ArrayList<>(); + properties.put(DynamicPartitionProperty.PREFIX, "p1"); + tmpAlterClauses.add(new ModifyTablePropertiesClause(properties)); + schemaChangeHandler.process(tmpAlterClauses, "default_cluster", db, olapTable); + Assert.assertEquals("p1", olapTable.getTableProperty().getDynamicPartitionProperty().getPrefix()); + // set dynamic_partition.buckets = 3 + tmpAlterClauses = new ArrayList<>(); + properties.put(DynamicPartitionProperty.BUCKETS, "3"); + tmpAlterClauses.add(new ModifyTablePropertiesClause(properties)); + schemaChangeHandler.process(tmpAlterClauses, "default_cluster", db, olapTable); + Assert.assertEquals(3, olapTable.getTableProperty().getDynamicPartitionProperty().getBuckets()); + } + + public void modifyDynamicPartitionWithoutTableProperty(String propertyKey, String propertyValue, String missPropertyKey) + throws UserException { + FakeCatalog.setCatalog(masterCatalog); + SchemaChangeHandler schemaChangeHandler = Catalog.getInstance().getSchemaChangeHandler(); + ArrayList alterClauses = new ArrayList<>(); + Map properties = new HashMap<>(); + properties.put(propertyKey, propertyValue); + alterClauses.add(new 
ModifyTablePropertiesClause(properties)); + + Database db = CatalogMocker.mockDb(); + OlapTable olapTable = (OlapTable) db.getTable(CatalogMocker.TEST_TBL2_ID); + + expectedEx.expect(DdlException.class); + expectedEx.expectMessage(String.format("Must assign %s properties", missPropertyKey)); + + schemaChangeHandler.process(alterClauses, "default_cluster", db, olapTable); + } + + @Test + public void testModifyDynamicPartitionWithoutTableProperty() throws UserException { + modifyDynamicPartitionWithoutTableProperty(DynamicPartitionProperty.ENABLE, "false", DynamicPartitionProperty.TIME_UNIT); + modifyDynamicPartitionWithoutTableProperty(DynamicPartitionProperty.TIME_UNIT, "day", DynamicPartitionProperty.ENABLE); + modifyDynamicPartitionWithoutTableProperty(DynamicPartitionProperty.END, "3", DynamicPartitionProperty.ENABLE); + modifyDynamicPartitionWithoutTableProperty(DynamicPartitionProperty.PREFIX, "p", DynamicPartitionProperty.ENABLE); + modifyDynamicPartitionWithoutTableProperty(DynamicPartitionProperty.BUCKETS, "30", DynamicPartitionProperty.ENABLE); + } } diff --git a/fe/src/test/java/org/apache/doris/analysis/CreateIndexClauseTest.java b/fe/src/test/java/org/apache/doris/analysis/CreateIndexClauseTest.java new file mode 100644 index 00000000000000..408ecc01d8fd0d --- /dev/null +++ b/fe/src/test/java/org/apache/doris/analysis/CreateIndexClauseTest.java @@ -0,0 +1,52 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. 
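+// Exercises CreateIndexClause.toSql(); the DDL form asserted in testNormal() below is
+//   CREATE INDEX index1 ON `db`.`table` (`col1`) USING BITMAP COMMENT 'balabala'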
+ +package org.apache.doris.analysis; + +import org.apache.doris.common.AnalysisException; + +import com.google.common.collect.Lists; + +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +public class CreateIndexClauseTest { + private static Analyzer analyzer; + + @BeforeClass + public static void setUp() { + analyzer = AccessTestUtil.fetchAdminAnalyzer(false); + } + + @Test + public void testNormal() throws AnalysisException { + CreateIndexClause clause = new CreateIndexClause(new TableName("db", "table"), new IndexDef("index1", + Lists.newArrayList("col1"), IndexDef.IndexType.BITMAP, "balabala"), false); + clause.analyze(analyzer); + Assert.assertEquals("CREATE INDEX index1 ON `db`.`table` (`col1`) USING BITMAP COMMENT 'balabala'", + clause.toSql()); + + } + + @Test(expected = AnalysisException.class) + public void testDuplIndex() throws AnalysisException { + CreateIndexClause clause = new CreateIndexClause(new TableName("db", "table"), null, false); + clause.analyze(analyzer); + + } +} \ No newline at end of file diff --git a/fe/src/test/java/org/apache/doris/analysis/DropIndexClauseTest.java b/fe/src/test/java/org/apache/doris/analysis/DropIndexClauseTest.java new file mode 100644 index 00000000000000..cb9177f4f1d943 --- /dev/null +++ b/fe/src/test/java/org/apache/doris/analysis/DropIndexClauseTest.java @@ -0,0 +1,48 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.analysis; + +import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.UserException; + +import org.junit.Assert; +import org.junit.BeforeClass; +import org.junit.Test; + +public class DropIndexClauseTest { + + private static Analyzer analyzer; + + @BeforeClass + public static void setUp() { + analyzer = AccessTestUtil.fetchAdminAnalyzer(false); + } + + @Test + public void testNormal() throws UserException { + DropIndexClause clause = new DropIndexClause("index1", new TableName("db", "table"), false); + clause.analyze(analyzer); + Assert.assertEquals("DROP INDEX index1 ON `db`.`table`", clause.toSql()); + } + + @Test(expected = AnalysisException.class) + public void testNoIndex() throws UserException { + DropIndexClause clause = new DropIndexClause("", new TableName("db", "table"), false); + clause.analyze(analyzer); + } +} \ No newline at end of file diff --git a/fe/src/test/java/org/apache/doris/analysis/IndexDefTest.java b/fe/src/test/java/org/apache/doris/analysis/IndexDefTest.java new file mode 100644 index 00000000000000..8f510210197ed9 --- /dev/null +++ b/fe/src/test/java/org/apache/doris/analysis/IndexDefTest.java @@ -0,0 +1,72 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. 
See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.analysis; + +import org.apache.doris.common.AnalysisException; + +import com.google.common.collect.Lists; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +public class IndexDefTest { + private IndexDef def; + + @Before + public void setUp() throws Exception { + def = new IndexDef("index1", Lists.newArrayList("col1"), IndexDef.IndexType.BITMAP, "balabala"); + } + + @Test + public void testAnalyzeNormal() throws AnalysisException { + def.analyze(); + } + + @Test + public void testAnalyzeExpection() throws AnalysisException { + try { + def = new IndexDef( + "index1xxxxxxxxxxxxxxxxxindex1xxxxxxxxxxxxxxxxxindex1xxxxxxxxxxxxxxxxxindex1xxxxxx" + + "xxxxxxxxxxxindex1xxxxxxxxxxxxxxxxxindex1xxxxxxxxxxxxxxxxxindex1xxxxxxxxxxxxxxxxxinde" + + "x1xxxxxxxxxxxxxxxxxindex1xxxxxxxxxxxxxxxxxindex1xxxxxxxxxxxxxxxxxindex1xxxxxxxxxxxxx" + + "xxxxindex1xxxxxxxxxxxxxxxxxindex1xxxxxxxxxxxxxxxxxindex1xxxxxxxxxxxxxxxxxindex1xxxxx" + + "xxxxxxxxxxxxindex1xxxxxxxxxxxxxxxxx", + Lists.newArrayList("col1"), IndexDef.IndexType.BITMAP, + "balabala"); + def.analyze(); + Assert.fail("No exception throws."); + } catch (AnalysisException e) { + Assert.assertTrue(e instanceof AnalysisException); + } + try { + def = new IndexDef("", Lists.newArrayList("col1"), IndexDef.IndexType.BITMAP, "balabala"); + def.analyze(); + Assert.fail("No exception throws."); + } catch (AnalysisException e) { + Assert.assertTrue(e instanceof AnalysisException); + } + } + + @Test + public void toSql() { + Assert.assertEquals("INDEX index1 (`col1`) USING BITMAP COMMENT 'balabala'", def.toSql()); + Assert.assertEquals("INDEX index1 ON table1 (`col1`) USING BITMAP COMMENT 'balabala'", + def.toSql("table1")); + } +} diff --git a/fe/src/test/java/org/apache/doris/analysis/ShowIndexStmtTest.java b/fe/src/test/java/org/apache/doris/analysis/ShowIndexStmtTest.java new file mode 100644 index 00000000000000..990fe54ab8f34d --- /dev/null +++ b/fe/src/test/java/org/apache/doris/analysis/ShowIndexStmtTest.java @@ -0,0 +1,66 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. 
See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.analysis; + +import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.UserException; +import org.apache.doris.mysql.privilege.MockedAuth; +import org.apache.doris.mysql.privilege.PaloAuth; +import org.apache.doris.qe.ConnectContext; + +import org.junit.Assert; +import org.junit.Before; +import org.junit.Test; + +import mockit.Mocked; + +public class ShowIndexStmtTest { + + private static Analyzer analyzer; + + @Mocked + private PaloAuth auth; + @Mocked + private ConnectContext ctx; + + @Before + public void setUp() { + analyzer = AccessTestUtil.fetchAdminAnalyzer(false); + MockedAuth.mockedAuth(auth); + MockedAuth.mockedConnectContext(ctx, "root", "192.168.1.1"); + } + + @Test + public void testNormal() throws UserException { + ShowIndexStmt stmt = new ShowIndexStmt("testDb", new TableName("", "testTbl")); + stmt.analyze(analyzer); + Assert.assertEquals("SHOW INDEX FROM `testDb`.`testTbl`", stmt.toSql()); + stmt = new ShowIndexStmt("", new TableName("", "testTbl")); + stmt.analyze(analyzer); + Assert.assertEquals("SHOW INDEX FROM `testDb`.`testTbl`", stmt.toSql()); + stmt = new ShowIndexStmt(null, new TableName("testDb", "testTbl")); + stmt.analyze(analyzer); + Assert.assertEquals("SHOW INDEX FROM `testDb`.`testTbl`", stmt.toSql()); + } + + @Test(expected = AnalysisException.class) + public void testNoTbl() throws UserException { + ShowIndexStmt stmt = new ShowIndexStmt("testDb", new TableName("", "")); + stmt.analyze(analyzer); + } +} \ No newline at end of file diff --git a/fe/src/test/java/org/apache/doris/analysis/ShowPartitionsStmtTest.java b/fe/src/test/java/org/apache/doris/analysis/ShowPartitionsStmtTest.java new file mode 100644 index 00000000000000..dead5d9b848f52 --- /dev/null +++ b/fe/src/test/java/org/apache/doris/analysis/ShowPartitionsStmtTest.java @@ -0,0 +1,100 @@ +package org.apache.doris.analysis; + +import org.apache.doris.catalog.Catalog; +import org.apache.doris.common.AnalysisException; +import org.apache.doris.common.UserException; +import org.apache.doris.system.SystemInfoService; +import org.easymock.EasyMock; +import org.junit.Assert; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; +import org.junit.runner.RunWith; +import org.powermock.api.easymock.PowerMock; +import org.powermock.core.classloader.annotations.PowerMockIgnore; +import org.powermock.core.classloader.annotations.PrepareForTest; +import org.powermock.modules.junit4.PowerMockRunner; + +import java.util.Arrays; + + +@RunWith(PowerMockRunner.class) +@PowerMockIgnore({ "org.apache.log4j.*", "javax.management.*" }) +@PrepareForTest(Catalog.class) +public class ShowPartitionsStmtTest { + private Analyzer analyzer; + private Catalog catalog; + private SystemInfoService systemInfo; + + @Rule + public ExpectedException expectedEx = ExpectedException.none(); + + @Before + public void setUp() { + catalog = AccessTestUtil.fetchAdminCatalog(); + systemInfo = new SystemInfoService(); + + PowerMock.mockStatic(Catalog.class); + EasyMock.expect(Catalog.getInstance()).andReturn(catalog).anyTimes(); + EasyMock.expect(Catalog.getCurrentSystemInfo()).andReturn(systemInfo).anyTimes(); + EasyMock.expect(Catalog.getCurrentCatalog()).andReturn(catalog).anyTimes(); + PowerMock.replay(Catalog.class); + + analyzer = EasyMock.createMock(Analyzer.class); + 
EasyMock.expect(analyzer.getDefaultDb()).andReturn("testDb").anyTimes(); + EasyMock.expect(analyzer.getCatalog()).andReturn(catalog).anyTimes(); + EasyMock.expect(analyzer.getClusterName()).andReturn("testCluster").anyTimes(); + EasyMock.replay(analyzer); + } + + @Test + public void testNormal() throws UserException { + ShowPartitionsStmt stmt = new ShowPartitionsStmt(new TableName("testDb", "testTable"), null, null, null); + stmt.analyzeImpl(analyzer); + Assert.assertEquals("SHOW PARTITIONS FROM `testCluster:testDb`.`testTable`", stmt.toString()); + } + + @Test + public void testShowPartitionsStmtWithBinaryPredicate() throws UserException { + SlotRef slotRef = new SlotRef(null, "LastConsistencyCheckTime"); + StringLiteral stringLiteral = new StringLiteral("2019-12-22 10:22:11"); + BinaryPredicate binaryPredicate = new BinaryPredicate(BinaryPredicate.Operator.GT, slotRef, stringLiteral); + ShowPartitionsStmt stmt = new ShowPartitionsStmt(new TableName("testDb", "testTable"), binaryPredicate, null, null); + stmt.analyzeImpl(analyzer); + Assert.assertEquals("SHOW PARTITIONS FROM `testCluster:testDb`.`testTable` WHERE `LastConsistencyCheckTime` > '2019-12-22 10:22:11'", stmt.toString()); + } + + @Test + public void testShowPartitionsStmtWithLikePredicate() throws UserException { + SlotRef slotRef = new SlotRef(null, "PartitionName"); + StringLiteral stringLiteral = new StringLiteral("%p2019%"); + LikePredicate likePredicate = new LikePredicate(LikePredicate.Operator.LIKE, slotRef, stringLiteral); + ShowPartitionsStmt stmt = new ShowPartitionsStmt(new TableName("testDb", "testTable"), likePredicate, null, null); + stmt.analyzeImpl(analyzer); + Assert.assertEquals("SHOW PARTITIONS FROM `testCluster:testDb`.`testTable` WHERE `PartitionName` LIKE '%p2019%'", stmt.toString()); + } + + @Test + public void testShowParitionsStmtOrderByAndLimit() throws UserException { + SlotRef slotRef = new SlotRef(null, "PartitionId"); + OrderByElement orderByElement = new OrderByElement(slotRef, true, false); + LimitElement limitElement = new LimitElement(10); + ShowPartitionsStmt stmt = new ShowPartitionsStmt(new TableName("testDb", "testTable"), null, Arrays.asList(orderByElement), limitElement); + stmt.analyzeImpl(analyzer); + Assert.assertEquals("SHOW PARTITIONS FROM `testCluster:testDb`.`testTable` ORDER BY `PartitionId` ASC LIMIT 10", stmt.toString()); + } + + @Test + public void testUnsupportFilter() throws UserException { + SlotRef slotRef = new SlotRef(null, "DataSize"); + StringLiteral stringLiteral = new StringLiteral("3.2 GB"); + BinaryPredicate binaryPredicate = new BinaryPredicate(BinaryPredicate.Operator.EQ, slotRef, stringLiteral); + ShowPartitionsStmt stmt = new ShowPartitionsStmt(new TableName("testDb", "testTable"), binaryPredicate, null, null); + expectedEx.expect(AnalysisException.class); + expectedEx.expectMessage("Only the columns of PartitionId/PartitionName/" + + "State/Buckets/ReplicationNum/LastConsistencyCheckTime are supported."); + stmt.analyzeImpl(analyzer); + } + +} diff --git a/fe/src/test/java/org/apache/doris/catalog/DynamicPartitionTableTest.java b/fe/src/test/java/org/apache/doris/catalog/DynamicPartitionTableTest.java new file mode 100644 index 00000000000000..9f98a414d117da --- /dev/null +++ b/fe/src/test/java/org/apache/doris/catalog/DynamicPartitionTableTest.java @@ -0,0 +1,482 @@ +package org.apache.doris.catalog; + +import com.google.common.collect.Lists; +import mockit.Expectations; +import mockit.Injectable; +import mockit.Mock; +import mockit.MockUp; +import 
org.apache.doris.analysis.Analyzer; +import org.apache.doris.analysis.ColumnDef; +import org.apache.doris.analysis.CreateTableStmt; +import org.apache.doris.analysis.HashDistributionDesc; +import org.apache.doris.analysis.KeysDesc; +import org.apache.doris.analysis.PartitionKeyDesc; +import org.apache.doris.analysis.PartitionValue; +import org.apache.doris.analysis.RangePartitionDesc; +import org.apache.doris.analysis.SingleRangePartitionDesc; +import org.apache.doris.analysis.TableName; +import org.apache.doris.analysis.TypeDef; +import org.apache.doris.common.DdlException; +import org.apache.doris.common.UserException; +import org.apache.doris.mysql.privilege.PaloAuth; +import org.apache.doris.mysql.privilege.PrivPredicate; +import org.apache.doris.persist.EditLog; +import org.apache.doris.qe.ConnectContext; +import org.apache.doris.system.SystemInfoService; +import org.apache.doris.task.AgentBatchTask; +import org.junit.Before; +import org.junit.Rule; +import org.junit.Test; +import org.junit.rules.ExpectedException; + +import java.util.HashMap; +import java.util.LinkedList; +import java.util.List; +import java.util.Map; +import java.util.concurrent.CountDownLatch; +import java.util.concurrent.TimeUnit; + +public class DynamicPartitionTableTest { + private TableName dbTableName; + private String dbName = "testDb"; + private String tableName = "testTable"; + private String clusterName = "default"; + private List beIds = Lists.newArrayList(); + private List columnNames = Lists.newArrayList(); + private List columnDefs = Lists.newArrayList(); + + private Catalog catalog = Catalog.getInstance(); + private Database db = new Database(); + private Analyzer analyzer; + + private Map properties; + private List singleRangePartitionDescs; + + @Injectable + ConnectContext connectContext; + + @Rule + public ExpectedException expectedEx = ExpectedException.none(); + + @Before + public void setUp() throws Exception { + dbTableName = new TableName(dbName, tableName); + + beIds.add(1L); + beIds.add(2L); + beIds.add(3L); + + columnNames.add("key1"); + columnNames.add("key2"); + columnNames.add("key3"); + + columnDefs.add(new ColumnDef("key1", new TypeDef(ScalarType.createType(PrimitiveType.INT)))); + columnDefs.add(new ColumnDef("key2", new TypeDef(ScalarType.createType(PrimitiveType.INT)))); + columnDefs.add(new ColumnDef("key3", new TypeDef(ScalarType.createVarchar(10)))); + + analyzer = new Analyzer(catalog, connectContext); + + properties = new HashMap<>(); + properties.put(DynamicPartitionProperty.ENABLE, "true"); + properties.put(DynamicPartitionProperty.PREFIX, "p"); + properties.put(DynamicPartitionProperty.TIME_UNIT, "day"); + properties.put(DynamicPartitionProperty.END, "3"); + properties.put(DynamicPartitionProperty.BUCKETS, "30"); + + singleRangePartitionDescs = new LinkedList<>(); + singleRangePartitionDescs.add(new SingleRangePartitionDesc(false, "p1", + new PartitionKeyDesc(Lists.newArrayList(new PartitionValue("-128"))), null)); + + new MockUp() { + @Mock + void run() { + return; + } + }; + + new MockUp() { + @Mock + boolean await(long timeout, TimeUnit unit) { + return true; + } + }; + + new Expectations(analyzer, catalog) {{ + analyzer.getClusterName(); + minTimes = 0; + result = clusterName; + }}; + + dbTableName.analyze(analyzer); + } + + @Test + public void testNormal(@Injectable SystemInfoService systemInfoService, + @Injectable PaloAuth paloAuth, + @Injectable EditLog editLog) throws UserException { + new Expectations(catalog) { + { + catalog.getDb(dbTableName.getDb()); + minTimes 
= 0; + result = db; + + Catalog.getCurrentSystemInfo(); + minTimes = 0; + result = systemInfoService; + + systemInfoService.checkClusterCapacity(anyString); + minTimes = 0; + systemInfoService.seqChooseBackendIds(anyInt, true, true, anyString); + minTimes = 0; + result = beIds; + + catalog.getAuth(); + minTimes = 0; + result = paloAuth; + paloAuth.checkTblPriv((ConnectContext) any, anyString, anyString, PrivPredicate.CREATE); + minTimes = 0; + result = true; + + catalog.getEditLog(); + minTimes = 0; + result = editLog; + } + }; + + CreateTableStmt stmt = new CreateTableStmt(false, false, dbTableName, columnDefs, "olap", + new KeysDesc(KeysType.AGG_KEYS, columnNames), + new RangePartitionDesc(Lists.newArrayList("key1"), singleRangePartitionDescs), + new HashDistributionDesc(1, Lists.newArrayList("key1")), properties, null, ""); + stmt.analyze(analyzer); + + catalog.createTable(stmt); + } + + @Test + public void testMissEnable(@Injectable SystemInfoService systemInfoService, + @Injectable PaloAuth paloAuth, + @Injectable EditLog editLog) throws UserException { + new Expectations(catalog) { + { + catalog.getDb(dbTableName.getDb()); + minTimes = 0; + result = db; + + Catalog.getCurrentSystemInfo(); + minTimes = 0; + result = systemInfoService; + + systemInfoService.checkClusterCapacity(anyString); + minTimes = 0; + systemInfoService.seqChooseBackendIds(anyInt, true, true, anyString); + minTimes = 0; + result = beIds; + + catalog.getAuth(); + minTimes = 0; + result = paloAuth; + paloAuth.checkTblPriv((ConnectContext) any, anyString, anyString, PrivPredicate.CREATE); + minTimes = 0; + result = true; + + catalog.getEditLog(); + minTimes = 0; + result = editLog; + } + }; + + properties.remove(DynamicPartitionProperty.ENABLE); + + CreateTableStmt stmt = new CreateTableStmt(false, false, dbTableName, columnDefs, "olap", + new KeysDesc(KeysType.AGG_KEYS, columnNames), + new RangePartitionDesc(Lists.newArrayList("key1"), singleRangePartitionDescs), + new HashDistributionDesc(1, Lists.newArrayList("key1")), properties, null, ""); + stmt.analyze(analyzer); + + expectedEx.expect(DdlException.class); + expectedEx.expectMessage("Must assign dynamic_partition.enable properties"); + + catalog.createTable(stmt); + } + + @Test + public void testMissPrefix(@Injectable SystemInfoService systemInfoService, + @Injectable PaloAuth paloAuth, + @Injectable EditLog editLog) throws UserException { + new Expectations(catalog) { + { + catalog.getDb(dbTableName.getDb()); + minTimes = 0; + result = db; + + Catalog.getCurrentSystemInfo(); + minTimes = 0; + result = systemInfoService; + + systemInfoService.checkClusterCapacity(anyString); + minTimes = 0; + systemInfoService.seqChooseBackendIds(anyInt, true, true, anyString); + minTimes = 0; + result = beIds; + + catalog.getAuth(); + minTimes = 0; + result = paloAuth; + paloAuth.checkTblPriv((ConnectContext) any, anyString, anyString, PrivPredicate.CREATE); + minTimes = 0; + result = true; + + catalog.getEditLog(); + minTimes = 0; + result = editLog; + } + }; + + properties.remove(DynamicPartitionProperty.PREFIX); + + CreateTableStmt stmt = new CreateTableStmt(false, false, dbTableName, columnDefs, "olap", + new KeysDesc(KeysType.AGG_KEYS, columnNames), + new RangePartitionDesc(Lists.newArrayList("key1"), singleRangePartitionDescs), + new HashDistributionDesc(1, Lists.newArrayList("key1")), properties, null, ""); + stmt.analyze(analyzer); + + expectedEx.expect(DdlException.class); + expectedEx.expectMessage("Must assign dynamic_partition.prefix properties"); + + 
catalog.createTable(stmt); + } + + @Test + public void testMissTimeUnit(@Injectable SystemInfoService systemInfoService, + @Injectable PaloAuth paloAuth, + @Injectable EditLog editLog) throws UserException { + new Expectations(catalog) { + { + catalog.getDb(dbTableName.getDb()); + minTimes = 0; + result = db; + + Catalog.getCurrentSystemInfo(); + minTimes = 0; + result = systemInfoService; + + systemInfoService.checkClusterCapacity(anyString); + minTimes = 0; + systemInfoService.seqChooseBackendIds(anyInt, true, true, anyString); + minTimes = 0; + result = beIds; + + catalog.getAuth(); + minTimes = 0; + result = paloAuth; + paloAuth.checkTblPriv((ConnectContext) any, anyString, anyString, PrivPredicate.CREATE); + minTimes = 0; + result = true; + + catalog.getEditLog(); + minTimes = 0; + result = editLog; + } + }; + + properties.remove(DynamicPartitionProperty.TIME_UNIT); + + CreateTableStmt stmt = new CreateTableStmt(false, false, dbTableName, columnDefs, "olap", + new KeysDesc(KeysType.AGG_KEYS, columnNames), + new RangePartitionDesc(Lists.newArrayList("key1"), singleRangePartitionDescs), + new HashDistributionDesc(1, Lists.newArrayList("key1")), properties, null, ""); + stmt.analyze(analyzer); + + expectedEx.expect(DdlException.class); + expectedEx.expectMessage("Must assign dynamic_partition.time_unit properties"); + + catalog.createTable(stmt); + } + + @Test + public void testMissEnd(@Injectable SystemInfoService systemInfoService, + @Injectable PaloAuth paloAuth, + @Injectable EditLog editLog) throws UserException { + new Expectations(catalog) { + { + catalog.getDb(dbTableName.getDb()); + minTimes = 0; + result = db; + + Catalog.getCurrentSystemInfo(); + minTimes = 0; + result = systemInfoService; + + systemInfoService.checkClusterCapacity(anyString); + minTimes = 0; + systemInfoService.seqChooseBackendIds(anyInt, true, true, anyString); + minTimes = 0; + result = beIds; + + catalog.getAuth(); + minTimes = 0; + result = paloAuth; + paloAuth.checkTblPriv((ConnectContext) any, anyString, anyString, PrivPredicate.CREATE); + minTimes = 0; + result = true; + + catalog.getEditLog(); + minTimes = 0; + result = editLog; + } + }; + + properties.remove(DynamicPartitionProperty.END); + + CreateTableStmt stmt = new CreateTableStmt(false, false, dbTableName, columnDefs, "olap", + new KeysDesc(KeysType.AGG_KEYS, columnNames), + new RangePartitionDesc(Lists.newArrayList("key1"), singleRangePartitionDescs), + new HashDistributionDesc(1, Lists.newArrayList("key1")), properties, null, ""); + stmt.analyze(analyzer); + + expectedEx.expect(DdlException.class); + expectedEx.expectMessage("Must assign dynamic_partition.end properties"); + + catalog.createTable(stmt); + } + + @Test + public void testMissBuckets(@Injectable SystemInfoService systemInfoService, + @Injectable PaloAuth paloAuth, + @Injectable EditLog editLog) throws UserException { + new Expectations(catalog) { + { + catalog.getDb(dbTableName.getDb()); + minTimes = 0; + result = db; + + Catalog.getCurrentSystemInfo(); + minTimes = 0; + result = systemInfoService; + + systemInfoService.checkClusterCapacity(anyString); + minTimes = 0; + systemInfoService.seqChooseBackendIds(anyInt, true, true, anyString); + minTimes = 0; + result = beIds; + + catalog.getAuth(); + minTimes = 0; + result = paloAuth; + paloAuth.checkTblPriv((ConnectContext) any, anyString, anyString, PrivPredicate.CREATE); + minTimes = 0; + result = true; + + catalog.getEditLog(); + minTimes = 0; + result = editLog; + } + }; + + properties.remove(DynamicPartitionProperty.BUCKETS); + 
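+ // with dynamic_partition.buckets removed the statement still analyzes; createTable below is expected
+ // to fail with "Must assign dynamic_partition.buckets properties"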
+ CreateTableStmt stmt = new CreateTableStmt(false, false, dbTableName, columnDefs, "olap", + new KeysDesc(KeysType.AGG_KEYS, columnNames), + new RangePartitionDesc(Lists.newArrayList("key1"), singleRangePartitionDescs), + new HashDistributionDesc(1, Lists.newArrayList("key1")), properties, null, ""); + stmt.analyze(analyzer); + + expectedEx.expect(DdlException.class); + expectedEx.expectMessage("Must assign dynamic_partition.buckets properties"); + + catalog.createTable(stmt); + } + + @Test + public void testNotAllowed(@Injectable SystemInfoService systemInfoService, + @Injectable PaloAuth paloAuth, + @Injectable EditLog editLog) throws UserException { + new Expectations(catalog) { + { + catalog.getDb(dbTableName.getDb()); + minTimes = 0; + result = db; + + Catalog.getCurrentSystemInfo(); + minTimes = 0; + result = systemInfoService; + + systemInfoService.checkClusterCapacity(anyString); + minTimes = 0; + systemInfoService.seqChooseBackendIds(anyInt, true, true, anyString); + minTimes = 0; + result = beIds; + + catalog.getAuth(); + minTimes = 0; + result = paloAuth; + paloAuth.checkTblPriv((ConnectContext) any, anyString, anyString, PrivPredicate.CREATE); + minTimes = 0; + result = true; + + catalog.getEditLog(); + minTimes = 0; + result = editLog; + } + }; + + CreateTableStmt stmt = new CreateTableStmt(false, false, dbTableName, columnDefs, "olap", + new KeysDesc(KeysType.AGG_KEYS, columnNames), null, + new HashDistributionDesc(1, Lists.newArrayList("key1")), properties, null, ""); + stmt.analyze(analyzer); + + expectedEx.expect(DdlException.class); + expectedEx.expectMessage("Only support dynamic partition properties on range partition table"); + + catalog.createTable(stmt); + } + + @Test + public void testNotAllowedInMultiPartitions(@Injectable SystemInfoService systemInfoService, + @Injectable PaloAuth paloAuth, + @Injectable EditLog editLog) throws UserException { + new Expectations(catalog) { + { + catalog.getDb(dbTableName.getDb()); + minTimes = 0; + result = db; + + Catalog.getCurrentSystemInfo(); + minTimes = 0; + result = systemInfoService; + + systemInfoService.checkClusterCapacity(anyString); + minTimes = 0; + systemInfoService.seqChooseBackendIds(anyInt, true, true, anyString); + minTimes = 0; + result = beIds; + + catalog.getAuth(); + minTimes = 0; + result = paloAuth; + paloAuth.checkTblPriv((ConnectContext) any, anyString, anyString, PrivPredicate.CREATE); + minTimes = 0; + result = true; + + catalog.getEditLog(); + minTimes = 0; + result = editLog; + } + }; + + List rangePartitionDescs = new LinkedList<>(); + rangePartitionDescs.add(new SingleRangePartitionDesc(false, "p1", + new PartitionKeyDesc(Lists.newArrayList(new PartitionValue("-128"), new PartitionValue("100"))), null)); + + CreateTableStmt stmt = new CreateTableStmt(false, false, dbTableName, columnDefs, "olap", + new KeysDesc(KeysType.AGG_KEYS, columnNames), + new RangePartitionDesc(Lists.newArrayList("key1", "key2"), singleRangePartitionDescs), + new HashDistributionDesc(1, Lists.newArrayList("key1")), properties, null, ""); + stmt.analyze(analyzer); + + expectedEx.expect(DdlException.class); + expectedEx.expectMessage("Dynamic partition only support single-column range partition"); + + catalog.createTable(stmt); + } +} \ No newline at end of file diff --git a/fe/src/test/java/org/apache/doris/catalog/FakeEditLog.java b/fe/src/test/java/org/apache/doris/catalog/FakeEditLog.java index 4d45afcb644665..3c179f16c21b8f 100644 --- a/fe/src/test/java/org/apache/doris/catalog/FakeEditLog.java +++ 
b/fe/src/test/java/org/apache/doris/catalog/FakeEditLog.java @@ -22,6 +22,7 @@ import org.apache.doris.alter.SchemaChangeJob; import org.apache.doris.cluster.Cluster; import org.apache.doris.persist.EditLog; +import org.apache.doris.persist.ModifyDynamicPartitionInfo; import org.apache.doris.persist.RoutineLoadOperation; import org.apache.doris.transaction.TransactionState; @@ -97,6 +98,11 @@ public void logAlterJob(AlterJobV2 alterJob) { } + @Mock + public void logDynamicPartition(ModifyDynamicPartitionInfo info) { + + } + public TransactionState getTransaction(long transactionId) { return allTransactionState.get(transactionId); } diff --git a/fe/src/test/java/org/apache/doris/catalog/OlapTableTest.java b/fe/src/test/java/org/apache/doris/catalog/OlapTableTest.java index da89f846239cc5..0a0a5a7bc4bc39 100644 --- a/fe/src/test/java/org/apache/doris/catalog/OlapTableTest.java +++ b/fe/src/test/java/org/apache/doris/catalog/OlapTableTest.java @@ -20,11 +20,15 @@ import mockit.Expectations; import mockit.Mock; import mockit.MockUp; + +import org.apache.doris.analysis.IndexDef; import org.apache.doris.catalog.Table.TableType; import org.apache.doris.common.FeConstants; import org.apache.doris.common.io.FastByteArrayOutputStream; import org.apache.doris.common.util.UnitTestUtil; +import com.google.common.collect.Lists; + import org.junit.Test; import java.io.DataInputStream; @@ -52,6 +56,8 @@ int getCurrentCatalogJournalVersion() { continue; } OlapTable tbl = (OlapTable) table; + tbl.setIndexes(Lists.newArrayList(new Index("index", Lists.newArrayList("col"), IndexDef.IndexType.BITMAP + , "xxxxxx"))); System.out.println("orig table id: " + tbl.getId()); FastByteArrayOutputStream byteArrayOutputStream = new FastByteArrayOutputStream(); diff --git a/fe/src/test/java/org/apache/doris/catalog/TablePropertyTest.java b/fe/src/test/java/org/apache/doris/catalog/TablePropertyTest.java new file mode 100644 index 00000000000000..ca69bd584ff691 --- /dev/null +++ b/fe/src/test/java/org/apache/doris/catalog/TablePropertyTest.java @@ -0,0 +1,74 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.catalog; + + +import org.junit.After; +import org.junit.Assert; +import org.junit.Test; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.HashMap; + +public class TablePropertyTest { + private static String fileName = "./TablePropertyTest"; + + @After + public void tearDown() { + File file = new File(fileName); + file.delete(); + } + + @Test + public void testNormal() throws IOException { + // 1. 
Write objects to file + File file = new File(fileName); + file.createNewFile(); + DataOutputStream out = new DataOutputStream(new FileOutputStream(file)); + + HashMap properties = new HashMap<>(); + properties.put(DynamicPartitionProperty.ENABLE, "true"); + properties.put(DynamicPartitionProperty.TIME_UNIT, "day"); + properties.put(DynamicPartitionProperty.END, "3"); + properties.put(DynamicPartitionProperty.PREFIX, "p"); + properties.put(DynamicPartitionProperty.BUCKETS, "30"); + properties.put("otherProperty", "unknownProperty"); + TableProperty tableProperty = new TableProperty(properties); + tableProperty.write(out); + out.flush(); + out.close(); + + // 2. Read objects from file + DataInputStream in = new DataInputStream(new FileInputStream(file)); + TableProperty readTableProperty = TableProperty.read(in); + DynamicPartitionProperty readDynamicPartitionProperty = readTableProperty.getDynamicPartitionProperty(); + DynamicPartitionProperty dynamicPartitionProperty = new DynamicPartitionProperty(properties); + Assert.assertEquals(readTableProperty.getProperties(), properties); + Assert.assertEquals(readDynamicPartitionProperty.getEnable(), dynamicPartitionProperty.getEnable()); + Assert.assertEquals(readDynamicPartitionProperty.getBuckets(), dynamicPartitionProperty.getBuckets()); + Assert.assertEquals(readDynamicPartitionProperty.getPrefix(), dynamicPartitionProperty.getPrefix()); + Assert.assertEquals(readDynamicPartitionProperty.getEnd(), dynamicPartitionProperty.getEnd()); + Assert.assertEquals(readDynamicPartitionProperty.getTimeUnit(), dynamicPartitionProperty.getTimeUnit()); + in.close(); + } +} diff --git a/fe/src/test/java/org/apache/doris/persist/ModifyDynamicPartitionInfoTest.java b/fe/src/test/java/org/apache/doris/persist/ModifyDynamicPartitionInfoTest.java new file mode 100644 index 00000000000000..27406e231e64b2 --- /dev/null +++ b/fe/src/test/java/org/apache/doris/persist/ModifyDynamicPartitionInfoTest.java @@ -0,0 +1,68 @@ +// Licensed to the Apache Software Foundation (ASF) under one +// or more contributor license agreements. See the NOTICE file +// distributed with this work for additional information +// regarding copyright ownership. The ASF licenses this file +// to you under the Apache License, Version 2.0 (the +// "License"); you may not use this file except in compliance +// with the License. You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, +// software distributed under the License is distributed on an +// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY +// KIND, either express or implied. See the License for the +// specific language governing permissions and limitations +// under the License. + +package org.apache.doris.persist; + +import org.apache.doris.catalog.DynamicPartitionProperty; +import org.junit.After; +import org.junit.Assert; +import org.junit.Test; + +import java.io.DataInputStream; +import java.io.DataOutputStream; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.util.HashMap; + +public class ModifyDynamicPartitionInfoTest { + private String fileName = "./ModifyDynamicPartitionInfoTest"; + + @After + public void tearDown() { + File file = new File(fileName); + file.delete(); + } + + @Test + public void testNormal() throws IOException { + // 1. 
Write objects to file + File file = new File(fileName); + file.createNewFile(); + DataOutputStream out = new DataOutputStream(new FileOutputStream(file)); + + HashMap properties = new HashMap<>(); + properties.put(DynamicPartitionProperty.ENABLE, "true"); + properties.put(DynamicPartitionProperty.TIME_UNIT, "day"); + properties.put(DynamicPartitionProperty.END, "3"); + properties.put(DynamicPartitionProperty.PREFIX, "p"); + properties.put(DynamicPartitionProperty.BUCKETS, "30"); + ModifyDynamicPartitionInfo modifyDynamicPartitionInfo = new ModifyDynamicPartitionInfo(100L, 200L, properties); + modifyDynamicPartitionInfo.write(out); + out.flush(); + out.close(); + + // 2. Read objects from file + DataInputStream in = new DataInputStream(new FileInputStream(file)); + ModifyDynamicPartitionInfo readModifyDynamicPartitionInfo = ModifyDynamicPartitionInfo.read(in); + Assert.assertEquals(readModifyDynamicPartitionInfo.getDbId(), 100L); + Assert.assertEquals(readModifyDynamicPartitionInfo.getTableId(), 200L); + Assert.assertEquals(readModifyDynamicPartitionInfo.getProperties(), properties); + in.close(); + } +} diff --git a/fe/src/test/java/org/apache/doris/task/AgentTaskTest.java b/fe/src/test/java/org/apache/doris/task/AgentTaskTest.java index b761e6f812eb3f..d29a6ae4249ecc 100644 --- a/fe/src/test/java/org/apache/doris/task/AgentTaskTest.java +++ b/fe/src/test/java/org/apache/doris/task/AgentTaskTest.java @@ -112,7 +112,7 @@ public void setUp() throws AnalysisException { indexId1, tabletId1, shortKeyNum, schemaHash1, version, versionHash, KeysType.AGG_KEYS, storageType, TStorageMedium.SSD, - columns, null, 0, latch); + columns, null, 0, latch, null); // drop dropTask = new DropReplicaTask(backendId1, tabletId1, schemaHash1); diff --git a/gensrc/proto/AgentService.thrift b/gensrc/proto/AgentService.thrift deleted file mode 100644 index e69de29bb2d1d6..00000000000000 diff --git a/gensrc/proto/olap_common.proto b/gensrc/proto/olap_common.proto index 3ac7c53a810dab..846b25ad50f6cf 100644 --- a/gensrc/proto/olap_common.proto +++ b/gensrc/proto/olap_common.proto @@ -38,6 +38,8 @@ message ColumnMessage { optional bool is_root_column = 14 [default=false]; // not used // is bloom filter column optional bool is_bf_column = 15 [default=false]; // ColumnPB.is_bf_column + // is bitmap index column + optional bool has_bitmap_index = 16 [default=false]; // ColumnPB.has_bitmap_index } enum CompressKind { diff --git a/gensrc/proto/olap_file.proto b/gensrc/proto/olap_file.proto index 4b72f5258eeae2..556548505398b4 100644 --- a/gensrc/proto/olap_file.proto +++ b/gensrc/proto/olap_file.proto @@ -264,6 +264,7 @@ message ColumnPB { optional bool is_bf_column = 12; // ColumnMessage.is_bf_column optional int32 referenced_column_id = 13; // optional string referenced_column = 14; // ColumnMessage.referenced_column? 
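+ // is bitmap index column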
+    optional bool has_bitmap_index = 15 [default=false]; // ColumnMessage.has_bitmap_index
 }
diff --git a/gensrc/proto/segment_v2.proto b/gensrc/proto/segment_v2.proto
index 19a3980c61e0f4..ad9cb27682e3fd 100644
--- a/gensrc/proto/segment_v2.proto
+++ b/gensrc/proto/segment_v2.proto
@@ -31,7 +31,7 @@ message ColumnSchemaPB {
     optional uint32 frac = 10 [default = 9];
     optional bool is_nullable = 11 [default=false];
     optional bool is_bf_column = 15 [default=false]; // is bloom filter indexed column
-    optional bool is_bitmap_column = 16 [default=false];
+    optional bool has_bitmap_index = 16 [default=false];
 }
 
 // page position info
diff --git a/gensrc/script/doris_builtins_functions.py b/gensrc/script/doris_builtins_functions.py
index 3000f3db8e4c43..c73edcd2b01a6e 100755
--- a/gensrc/script/doris_builtins_functions.py
+++ b/gensrc/script/doris_builtins_functions.py
@@ -610,7 +610,10 @@
         '_ZN5doris15BitmapFunctions12bitmap_countEPN9doris_udf15FunctionContextERKNS1_9StringValE'],
     [['bitmap_empty'], 'VARCHAR', [],
         '_ZN5doris15BitmapFunctions12bitmap_emptyEPN9doris_udf15FunctionContextE'],
-
+    [['bitmap_or'], 'VARCHAR', ['VARCHAR','VARCHAR'],
+        '_ZN5doris15BitmapFunctions9bitmap_orEPN9doris_udf15FunctionContextERKNS1_9StringValES6_'],
+    [['bitmap_and'], 'VARCHAR', ['VARCHAR','VARCHAR'],
+        '_ZN5doris15BitmapFunctions10bitmap_andEPN9doris_udf15FunctionContextERKNS1_9StringValES6_'],
 
     # aes and base64 function
     [['aes_encrypt'], 'VARCHAR', ['VARCHAR', 'VARCHAR'],
diff --git a/gensrc/thrift/AgentService.thrift b/gensrc/thrift/AgentService.thrift
index 8c5b0388eca52f..d1a00578423666 100644
--- a/gensrc/thrift/AgentService.thrift
+++ b/gensrc/thrift/AgentService.thrift
@@ -21,6 +21,7 @@ namespace java org.apache.doris.thrift
 include "Status.thrift"
 include "Types.thrift"
 include "PaloInternalService.thrift"
+include "Descriptors.thrift"
 
 struct TColumn {
     1: required string column_name
@@ -39,6 +40,7 @@ struct TTabletSchema {
     4: required Types.TStorageType storage_type
     5: required list<TColumn> columns
     6: optional double bloom_filter_fpp
+    7: optional list<Descriptors.TOlapTableIndex> indexes
 }
 
 // this enum stands for different storage format in src_backends
diff --git a/gensrc/thrift/Descriptors.thrift b/gensrc/thrift/Descriptors.thrift
index 2d10dbd8775657..734b11f7cf280b 100644
--- a/gensrc/thrift/Descriptors.thrift
+++ b/gensrc/thrift/Descriptors.thrift
@@ -99,6 +99,10 @@ enum THdfsCompression {
   SNAPPY_BLOCKED // Used by sequence and rc files but not stored in the metadata.
 }
 
+enum TIndexType {
+  BITMAP
+}
+
 // Mapping from names defined by Avro to the enum.
 // We permit gzip and bzip2 in addition.
 const map<string, THdfsCompression> COMPRESSION_MAP = {
@@ -166,6 +170,13 @@ struct TOlapTableSchemaParam {
     6: required list<TOlapTableIndexSchema> indexes
 }
 
+struct TOlapTableIndex {
+    1: optional string index_name
+    2: optional list<string> columns
+    3: optional TIndexType index_type
+    4: optional string comment
+}
+
 struct TTabletLocation {
     1: required i64 tablet_id
     2: required list<i64> node_ids
diff --git a/run-ut.sh b/run-ut.sh
index f0797033df63c5..bba2b6607e81d8 100755
--- a/run-ut.sh
+++ b/run-ut.sh
@@ -164,6 +164,7 @@ ${DORIS_TEST_BINARY_DIR}/util/counter_cond_variable_test
 ${DORIS_TEST_BINARY_DIR}/util/bit_stream_utils_test
 ${DORIS_TEST_BINARY_DIR}/util/frame_of_reference_coding_test
 ${DORIS_TEST_BINARY_DIR}/util/zip_util_test
+${DORIS_TEST_BINARY_DIR}/util/utf8_check_test
 
 # Running common Unittest
 ${DORIS_TEST_BINARY_DIR}/common/resource_tls_test
diff --git a/thirdparty/build-thirdparty.sh b/thirdparty/build-thirdparty.sh
index 92094fa83ad1a3..26e621edf7e20a 100755
--- a/thirdparty/build-thirdparty.sh
+++ b/thirdparty/build-thirdparty.sh
@@ -352,7 +352,8 @@ build_snappy() {
     cp $TP_INCLUDE_DIR/snappy/snappy-c.h $TP_INCLUDE_DIR/snappy-c.h && \
     cp $TP_INCLUDE_DIR/snappy/snappy-sinksource.h $TP_INCLUDE_DIR/snappy-sinksource.h && \
     cp $TP_INCLUDE_DIR/snappy/snappy-stubs-public.h $TP_INCLUDE_DIR/snappy-stubs-public.h && \
-    cp $TP_INCLUDE_DIR/snappy/snappy.h $TP_INCLUDE_DIR/snappy.h
+    cp $TP_INCLUDE_DIR/snappy/snappy.h $TP_INCLUDE_DIR/snappy.h && \
+    cp $TP_INSTALL_DIR/lib/libsnappy.a $TP_INSTALL_DIR/libsnappy.a
 }
 
 # gperftools
@@ -425,7 +426,7 @@ build_curl() {
     LDFLAGS="-L${TP_LIB_DIR}" LIBS="-lcrypto -lssl -lcrypto -ldl" \
     CFLAGS="-fPIC" \
     ./configure --prefix=$TP_INSTALL_DIR --disable-shared --enable-static \
-    --with-ssl=${TP_INSTALL_DIR} --without-libidn2 --disable-ldap --enable-ipv6
+    --without-librtmp --with-ssl=${TP_INSTALL_DIR} --without-libidn2 --disable-ldap --enable-ipv6
     make -j$PARALLEL && make install
 }
 
@@ -541,6 +542,20 @@ build_librdkafka() {
     make -j$PARALLEL && make install
 }
 
+# flatbuffers
+build_flatbuffers() {
+    check_if_source_exist $FLATBUFFERS_SOURCE
+    cd $TP_SOURCE_DIR/$FLATBUFFERS_SOURCE
+    mkdir build -p && cd build
+    rm -rf CMakeCache.txt CMakeFiles/
+    cmake ..
+    CXXFLAGS="-fPIC" make -j$PARALLEL
+    cp flatc ../../../installed/bin/flatc
+    cp -r ../include/flatbuffers ../../../installed/include/flatbuffers
+    cp libflatbuffers.a ../../../installed/lib/libflatbuffers.a
+}
+
+# arrow
 build_arrow() {
     check_if_source_exist $ARROW_SOURCE
     cd $TP_SOURCE_DIR/$ARROW_SOURCE/cpp && mkdir -p release && cd release
@@ -703,6 +718,7 @@ build_leveldb
 build_brpc
 build_rocksdb
 build_librdkafka
+build_flatbuffers
 build_arrow
 build_s2
 build_bitshuffle
diff --git a/thirdparty/vars.sh b/thirdparty/vars.sh
index 9e67640f553ac3..63c2a4a0c058b4 100644
--- a/thirdparty/vars.sh
+++ b/thirdparty/vars.sh
@@ -231,7 +231,7 @@ BROTLI_MD5SUM="7b6edd4f2128f22794d0ca28c53898a5"
 # flatbuffers
 FLATBUFFERS_DOWNLOAD="https://github.com/google/flatbuffers/archive/v1.10.0.tar.gz"
 FLATBUFFERS_NAME=flatbuffers-v1.10.0.tar.gz
-FLATBUFFERS_SOURCE=flatbuffers-v1.10.0
+FLATBUFFERS_SOURCE=flatbuffers-1.10.0
 FLATBUFFERS_MD5SUM="f7d19a3f021d93422b0bc287d7148cd2"
 
 # arrow