/**
	Utility functions for string processing

	Copyright: © 2012-2014 Sönke Ludwig
	License: Subject to the terms of the MIT license, as written in the included LICENSE.txt file.
	Authors: Sönke Ludwig
*/
module vibe.utils.string;

public import std.string;

import vibe.utils.array;
import vibe.internal.utilallocator;

import std.algorithm;
import std.array;
import std.ascii;
import std.format;
import std.typecons : Yes;
import std.uni;
import std.utf;
import core.exception;


/**
	Takes a string with possibly invalid UTF8 sequences and outputs a valid UTF8 string as near to
	the original as possible.

	Valid sequences are copied unchanged. Every byte at which decoding fails is
	re-encoded as the code point with the same numeric value (U+0000 .. U+00FF).

	Params:
		str = raw bytes that are supposed to be UTF-8 encoded text
	Returns:
		A newly allocated, valid UTF-8 string.
*/
string sanitizeUTF8(in ubyte[] str)
@safe pure {
	import std.utf;
	auto ret = appender!string();
	ret.reserve(str.length);

	size_t i = 0;
	while (i < str.length) {
		// fallback value in case the sequence starting at `i` is invalid
		dchar ch = str[i];
		// decode() advances `i` only on success, so skip the bad byte manually
		try ch = std.utf.decode(cast(const(char[]))str, i);
		catch( UTFException ){ i++; }
		char[4] dst;
		auto len = std.utf.encode(dst, ch);
		ret.put(dst[0 .. len]);
	}

	return ret.data;
}

/**
	Strips the byte order mark of an UTF8 encoded string.
	This is useful when the string is coming from a file.

	Params:
		str = the string to process
	Returns:
		A slice of `str` without the leading bytes `EF BB BF`, or `str` itself
		if it does not start with them.
*/
inout(char)[] stripUTF8Bom(inout(char)[] str)
@safe pure nothrow {
	if (str.length >= 3 && str[0 .. 3] == [0xEF, 0xBB, 0xBF])
		return str[3 ..$];
	return str;
}


/**
	Checks if all characters in 'str' are contained in 'chars'.

	Both strings are compared code point by code point. An empty `str` yields
	`true`.
*/
bool allOf(const(char)[] str, const(char)[] chars)
@safe pure {
	foreach (dchar ch; str)
		if (!chars.canFind(ch))
			return false;
	return true;
}

/**
	Checks if any character in 'str' is contained in 'chars'.

	Like `allOf`, the comparison is done on code points, so a single byte of a
	multi-byte UTF-8 sequence in `str` never matches an unrelated character in
	`chars`. An empty `str` yields `false`.
*/
bool anyOf(const(char)[] str, const(char)[] chars)
@safe pure {
	return indexOfAny(str, chars) >= 0;
}


/// ASCII whitespace trimming (space and tab) on the left side of `s`
inout(char)[] stripLeftA(inout(char)[] s)
@safe pure nothrow {
	while (s.length > 0 && (s[0] == ' ' || s[0] == '\t'))
		s = s[1 .. $];
	return s;
}

/// ASCII whitespace trimming (space and tab) on the right side of `s`
inout(char)[] stripRightA(inout(char)[] s)
@safe pure nothrow {
	while (s.length > 0 && (s[$-1] == ' ' || s[$-1] == '\t'))
		s = s[0 .. $-1];
	return s;
}

/// ASCII whitespace trimming (space and tab) on both sides of `s`
inout(char)[] stripA(inout(char)[] s)
@safe pure nothrow {
	return stripLeftA(stripRightA(s));
}

/**
	Finds the first occurrence of any of the characters in `chars`.

	`str` is scanned code point by code point, so that a lead or continuation
	byte of a multi-byte UTF-8 sequence is never mistaken for the code point
	with the same numeric value. Invalid sequences in `str` are treated as
	U+FFFD instead of raising an exception.

	Params:
		str = the string to search in
		chars = the set of characters to search for
	Returns:
		The index (in code units) of the first matching character in `str`,
		or -1 if there is no match.
*/
sizediff_t indexOfAny(const(char)[] str, const(char)[] chars)
@safe pure {
	size_t i = 0;
	while (i < str.length) {
		immutable start = i;
		// decode() moves `i` past the current code point
		immutable dchar ch = decode!(Yes.useReplacementDchar)(str, i);
		if (chars.canFind(ch))
			return start;
	}
	return -1;
}
alias countUntilAny = indexOfAny;

/**
	Finds the closing bracket (works with any of '[', '$(LPAREN)', '<', '{').

	Params:
		str = input string
		nested = whether to skip nested brackets
	Returns:
		The index of the closing bracket or -1 for unbalanced strings
		and strings that don't start with a bracket.
*/
sizediff_t matchBracket(const(char)[] str, bool nested = true)
@safe pure nothrow {
	if (str.length < 2) return -1;

	// the kind of bracket is determined by the first character
	char open = str[0], close = void;
	switch (str[0]) {
		case '[': close = ']'; break;
		case '(': close = ')'; break;
		case '<': close = '>'; break;
		case '{': close = '}'; break;
		default: return -1;
	}

	// number of currently open brackets of the same kind
	size_t level = 1;
	foreach (i, char c; str[1 .. $]) {
		if (nested && c == open) ++level;
		else if (c == close) --level;
		// `i` is relative to str[1 .. $], hence the + 1
		if (level == 0) return i + 1;
	}
	return -1;
}

@safe unittest
{
	static struct Test { string str; sizediff_t res; }
	enum tests = [
		Test("[foo]", 4), Test("<bar>", 4), Test("{baz}", 4),
		Test("[", -1), Test("[foo", -1), Test("ab[f]", -1),
		Test("[foo[bar]]", 9), Test("[foo{bar]]", 8),
	];
	foreach (test; tests)
		assert(matchBracket(test.str) == test.res);
	assert(matchBracket("[foo[bar]]", false) == 8);
	static assert(matchBracket("[foo]") == 4);
}

/**
	Same as std.string.format, just using an allocator.

	Params:
		alloc = allocator handed to the `AllocAppender` that collects the output
		fmt = format string, interpreted by `formattedWrite`
		args = arguments referenced by `fmt`
	Returns:
		The data of the appender after formatting.
*/
string formatAlloc(ARGS...)(IAllocator alloc, string fmt, ARGS args)
{
	auto app = AllocAppender!string(alloc);
	formattedWrite(() @trusted { return &app; } (), fmt, args);
	return () @trusted { return app.data; } ();
}

/**
	Special version of icmp() with optimization for ASCII characters

	Compares two strings case insensitively. ASCII characters are folded
	directly, everything else is decoded and folded using `std.uni.toLower`.
	Invalid UTF-8 sequences are decoded as U+FFFD.

	Params:
		a = left hand side of the comparison
		b = right hand side of the comparison
	Returns:
		A negative value if `a` sorts before `b`, a positive value if it sorts
		after `b` and 0 if both are equal when ignoring case.
*/
int icmp2(const(char)[] a, const(char)[] b)
@safe pure nothrow {
	size_t i = 0, j = 0;

	// fast skip equal prefix
	size_t min_len = min(a.length, b.length);
	while( i < min_len && a[i] == b[i] ) i++;

	// The byte-wise scan may have stopped inside of a multi-byte UTF-8
	// sequence. Rewind over all trailing continuation bytes (10xxxxxx) and
	// the lead byte (11xxxxxx) in front of them, so that decoding restarts at
	// a code point boundary. The rewound bytes are identical in both strings,
	// so at worst one equal character gets compared a second time.
	while( i > 0 && (a[i-1] & 0xC0) == 0x80 ) i--;
	if( i > 0 && (a[i-1] & 0xC0) == 0xC0 ) i--;
	j = i;

	// compare the differing character and the rest of the string
	while(i < a.length && j < b.length){
		uint ac = cast(uint)a[i];
		uint bc = cast(uint)b[j];
		if( !((ac | bc) & 0x80) ){
			// both are ASCII: fold to lower case without decoding
			i++;
			j++;
			if( ac >= 'A' && ac <= 'Z' ) ac += 'a' - 'A';
			if( bc >= 'A' && bc <= 'Z' ) bc += 'a' - 'A';
			if( ac < bc ) return -1;
			else if( ac > bc ) return 1;
		} else {
			// at least one side is non-ASCII: compare full code points
			dchar acp = decode!(Yes.useReplacementDchar)(a, i);
			dchar bcp = decode!(Yes.useReplacementDchar)(b, j);
			if( acp != bcp ){
				acp = std.uni.toLower(acp);
				bcp = std.uni.toLower(bcp);
				if( acp < bcp ) return -1;
				else if( acp > bcp ) return 1;
			}
		}
	}

	// one string is a (case insensitive) prefix of the other one
	if( i < a.length ) return 1;
	else if( j < b.length ) return -1;

	assert(i == a.length || j == b.length, "Strings equal but we didn't fully compare them!?");
	return 0;
}

@safe unittest
{
	assert(icmp2("abc", "ABC") == 0);
	assert(icmp2("abc", "abd") < 0);
	assert(icmp2("abd", "abc") > 0);
	assert(icmp2("abc", "abcd") < 0);
	assert(icmp2("\u00C4\u00D6", "\u00E4\u00F6") == 0);
	// strings differing only in the last byte of a three byte sequence
	assert(icmp2("\u20AC", "\u20AD") < 0);
	assert(icmp2("\u20ACx", "\u20ACy") < 0);
}