Replies: 7 comments 5 replies
-
|
Are you trying to count certain symbols? |
Beta Was this translation helpful? Give feedback.
-
|
If you just want to count symbols, this is the best I can offer you now: https://godbolt.org/z/1rejYz7rM |
Beta Was this translation helpful? Give feedback.
-
|
Obviously, I want to get the positions and types of the block elements. Can I compare this to home? Consider a simple structure that holds the values blockType, posStart, and *posEnd; each such structure should then be appended to a std::vector. |
Beta Was this translation helpful? Give feedback.
-
#pragma once
#include <cassert>
#include <cstring>
#include <deque>
#include <eve/eve.hpp>
#include <vector>
#include "helper.hpp"
// #include "src/PerformanceLogger.hpp"
namespace tokenizer {
// Kinds of token the tokenizer can describe. The numeric values follow the
// declaration order, starting at zero.
enum class TokenType {
  kNull = 0,       // emitted by parse() when a value starts with 'n'
  kTrue = 1,       // emitted by parse() when a value starts with 't'
  kFalse = 2,      // emitted by parse() when a value starts with 'f'
  kNumber = 3,     // numeric value
  kString = 4,     // string value
  kArray = 5,      // emitted by parse() for '['
  kArrayEnd = 6,   // emitted by parse() for ']'
  kObject = 7,     // emitted by parse() for '{'
  kObjectEnd = 8,  // emitted by parse() for '}'
  kComma = 9,      // ',' separator
  kColon = 10,     // ':' separator
  kError = 11,     // NOTE(review): presumably marks malformed input -- confirm
};
// SIMD register holding 8 lanes of std::uint8_t.
using wide_u8 = eve::wide<std::uint8_t, eve::fixed<8>>;
// SIMD register holding 16 lanes of std::uint8_t.
// NOTE: despite the "u16" suffix the element type is still std::uint8_t;
// the number in both alias names is the lane count, not the element width.
using wide_u16 = eve::wide<std::uint8_t, eve::fixed<16>>;
// Bytes that isWhiteSpace() reports as whitespace: space, tab, line feed,
// carriage return, form feed and backspace, plus the separators ':' and ','.
// NOTE(review): ':' and ',' are not whitespace characters; they look like
// they were added so separators are skipped as well -- confirm this is
// intended.
static const wide_u8 kSpaceCharList{' ', '\t', '\n', '\r',
'\f', '\b', ':', ','};
// Returns true when `c` equals at least one lane of kSpaceCharList.
EVE_FORCEINLINE bool isWhiteSpace(const std::uint8_t& c) {
  // Lane-wise comparison of the table against the scalar `c`.
  const auto laneMatches = (kSpaceCharList == c);
  return eve::any(laneMatches);
}
/*
Byte table consulted by isEscape(): space, tab, line feed, carriage return,
form feed, backspace, vertical tab and bell (alert).
NOTE(review): this was originally titled "JSON String Escape Table", but the
entries are raw byte values (including a plain space), not the backslash
character -- confirm the intended contents.
*/
static const wide_u8 kEscapeTable{' ', '\t', '\n', '\r',
'\f', '\b', '\v', '\a'};
// Returns true when `c` equals at least one lane of kEscapeTable.
EVE_FORCEINLINE bool isEscape(const std::uint8_t& c) {
  // The table is compared against the scalar directly, so no explicitly
  // broadcast copy of `c` is needed.
  return eve::any(kEscapeTable == c);
}
// JSON Number Table
/*
'0': 0x30, '1': 0x31, '2': 0x32, '3': 0x33, '4': 0x34, '5': 0x35, '6': 0x36,
'7': 0x37, '8': 0x38, '9': 0x39,
'-': 0x2d, '+': 0x2b, '.': 0x2e, 'e': 0x65, 'E': 0x45,
total count is 15
*/
// The 15 bytes accepted as part of a number: the digits '0'-'9' followed by
// '-', '+', '.', 'e' and 'E'.
// wide_u16 has 16 lanes, so the last lane is padding. It repeats 'E' (0x45)
// instead of holding 0x00, so that a NUL byte is not classified as numeric.
static const wide_u16 kNumericTable{0x30, 0x31, 0x32, 0x33, 0x34, 0x35,
                                    0x36, 0x37, 0x38, 0x39, 0x2d, 0x2b,
                                    0x2e, 0x65, 0x45, 0x45};
// Returns true when `c` equals at least one lane of kNumericTable, i.e. when
// it is a digit, a sign, a decimal point or an exponent marker.
EVE_FORCEINLINE bool isNumeric(const std::uint8_t& c) {
  return eve::any(kNumericTable == c);
}
// A single token record: its kind, a start position, and an optional pointer
// to an end position.
struct Token {
  // Builds a token with no end position; `to` is left null.
  Token(TokenType type, int from) : Token(type, from, nullptr) {}

  // Builds a token that also stores the given end-position pointer. The
  // pointer is stored as-is; Token has no destructor and never frees it.
  Token(TokenType type, int from, int* to) : type(type), from(from), to(to) {}

  TokenType type;     // kind of token
  int from;           // start position supplied by the creator
  int* to = nullptr;  // optional end position; null when not provided
};
// Position record for a primitive value (number, string, true, false or
// null).
// NOTE(review): nothing in the visible part of this header uses State.
struct State {
// Kind of primitive this record describes.
TokenType type;
// presumably the position where the value starts -- TODO confirm
int from;
// presumably the position where the value ends -- TODO confirm
int to;
};
// Disabled constants. Read as little-endian bytes they spell the ASCII text
// "alse", "true" and "null" respectively.
// static constexpr uint32_t kFalseBin = 0x65736c61;
// static constexpr uint32_t kTrueBin = 0x65757274;
// static constexpr uint32_t kNullBin = 0x6c6c756e;
// Reference to the object returned by StringBitset::getInstance(); parse()
// queries it through hasWhitespace(), hasEscape() and hasNumeric().
// Being `static` at namespace scope, every translation unit that includes
// this header gets its own copy of the reference.
// NOTE(review): StringBitset is presumably declared in helper.hpp -- confirm.
static auto& strBitset = StringBitset::getInstance();
// Scans `size` bytes starting at `data` and records the structural tokens
// found.
//
// Tokens emitted (Token::from is the byte offset of the token's first byte):
//   '{' -> kObject     '}' -> kObjectEnd
//   '[' -> kArray      ']' -> kArrayEnd
//   't' -> kTrue       'f' -> kFalse       'n' -> kNull
// Literals are recognised by their first byte only; the remaining bytes are
// skipped without validation. Strings and numbers are skipped over and do not
// produce a token. Bytes for which strBitset.hasWhitespace() is true, and any
// other unrecognised byte, are ignored.
//
// The scan never reads at or beyond data[size], even for an unterminated
// string or a number that ends the buffer.
//
// @param data  input bytes; must be readable for `size` bytes when size > 0
// @param size  number of bytes to scan
// @return      a heap-allocated vector of tokens in input order. The caller
//              owns the vector and must `delete` it.
const EVE_FORCEINLINE std::vector<Token>* parse(std::uint8_t* data,
                                                std::size_t size) {
  auto* tokenList = new std::vector<Token>();
  for (std::size_t i = 0; i < size; i++) {
    // Skip every byte the shared bitset classifies as whitespace.
    if (strBitset.hasWhitespace(data[i])) {
      continue;
    }
    // Token stores its position as int, so make the narrowing explicit.
    const int position = static_cast<int>(i);
    switch (data[i]) {
      case '{':
        tokenList->emplace_back(TokenType::kObject, position);
        break;
      case '}':
        tokenList->emplace_back(TokenType::kObjectEnd, position);
        break;
      case '[':
        tokenList->emplace_back(TokenType::kArray, position);
        break;
      case ']':
        tokenList->emplace_back(TokenType::kArrayEnd, position);
        break;
      case '"':
        // Advance to the closing quote; the loop increment then steps past
        // it. The bounds check stops an unterminated string at end of input.
        ++i;
        while (i < size && data[i] != '"') {
          // A byte flagged by hasEscape() also consumes the byte after it.
          // NOTE(review): presumably hasEscape() flags the backslash, so an
          // escaped quote does not end the string -- confirm in helper.hpp.
          if (strBitset.hasEscape(data[i])) {
            ++i;
          }
          ++i;
        }
        break;
      case 't':
        tokenList->emplace_back(TokenType::kTrue, position);
        i += 3;  // skip the rest of "true"
        break;
      case 'f':
        tokenList->emplace_back(TokenType::kFalse, position);
        i += 4;  // skip the rest of "false"
        break;
      case 'n':
        tokenList->emplace_back(TokenType::kNull, position);
        i += 3;  // skip the rest of "null"
        break;
      default:
        if (strBitset.hasNumeric(data[i])) {
          // Stop on the LAST byte of the number, checking every byte. The
          // loop increment then lands on the delimiter, so a ']' or '}'
          // that directly follows the number is still tokenized.
          while (i + 1 < size && strBitset.hasNumeric(data[i + 1])) {
            ++i;
          }
        }
        break;
    }
  }
  return tokenList;
}
}; // namespace tokenizer
This build currently runs at 600 MB/s. I want to improve its performance with parallel processing using EVE. |
Beta Was this translation helpful? Give feedback.
-
|
I don't think it's as obvious if you are outside of the context. I would also suggest checking whether SIMD JSON is the solution you are looking for; it would be very difficult for you to compete with them. |
Beta Was this translation helpful? Give feedback.
-
|
I will have a look over the weekend; there is a lot there. So far, I'd say the best I can do for you would be to vectorise the splitting into substrings, like this: https://github.com/jfalcou/eve/blob/main/examples/algorithms/writing_new/collect_indexes__complicated_real_example.cpp But I don't know if it would be faster than your switch. |
Beta Was this translation helpful? Give feedback.
-
|
After looking at this: a) I suspect this is fairly difficult. I'd suggest using the example I gave you and seeing whether it is faster than what you have. |
Beta Was this translation helpful? Give feedback.
Uh oh!
There was an error while loading. Please reload this page.
Uh oh!
There was an error while loading. Please reload this page.
-
Beta Was this translation helpful? Give feedback.
All reactions