/**
	Utility functions for string processing

	Copyright: © 2012-2014 Sönke Ludwig
	License: Subject to the terms of the MIT license, as written in the included LICENSE.txt file.
	Authors: Sönke Ludwig
*/
module vibe.utils.string;

public import std.string;

import vibe.utils.array;
import vibe.container.internal.utilallocator;

import std.algorithm;
import std.array;
import std.ascii;
import std.format;
import std.typecons : Yes;
import std.uni;
import std.utf;
import core.exception;


/**
	Takes a string with possibly invalid UTF8 sequences and outputs a valid UTF8 string as near to
	the original as possible.

	Every position that fails to decode is skipped by a single byte and that
	byte's numeric value is emitted as a code point instead.

	Params:
		str = raw bytes that are supposed to contain UTF-8 text
	Returns:
		A newly built string made of the re-encoded code points.
*/
deprecated("Use `std.encoding.sanitize` instead")
string sanitizeUTF8(in ubyte[] str)
@safe pure {
	import std.utf;
	auto ret = appender!string();
	ret.reserve(str.length);

	size_t i = 0;
	while (i < str.length) {
		// fallback value in case decoding fails: the raw byte itself
		dchar ch = str[i];
		try ch = std.utf.decode(cast(const(char[]))str, i);
		catch( UTFException ){ i++; } // skip exactly one byte and keep the fallback
		//catch( AssertError ){ i++; }
		char[4] dst;
		auto len = std.utf.encode(dst, ch);
		ret.put(dst[0 .. len]);
	}

	return ret.data;
}

/**
	Strips the byte order mark of an UTF8 encoded string.
	This is useful when the string is coming from a file.

	Params:
		str = text that may start with the bytes EF BB BF
	Returns:
		`str` without its first three bytes if they form the BOM, `str`
		unchanged otherwise. The result is always a slice of the input.
*/
inout(char)[] stripUTF8Bom(inout(char)[] str)
@safe pure nothrow {
	if (str.length >= 3 && str[0 .. 3] == [0xEF, 0xBB, 0xBF])
		return str[3 ..$];
	return str;
}


/**
	Checks if all characters in 'str' are contained in 'chars'.

	Both strings are processed by code point. An empty `str` yields `true`.
*/
bool allOf(const(char)[] str, const(char)[] chars)
@safe pure {
	foreach (dchar ch; str)
		if (!chars.canFind(ch))
			return false;
	return true;
}

/**
	Checks if any character in 'str' is contained in 'chars'.

	`str` is decoded code point by code point, so that multi-byte characters
	are compared as a whole instead of byte by byte. Malformed sequences in
	`str` are treated as U+FFFD instead of raising an exception.

	Params:
		str = the string to inspect
		chars = the set of characters to look for
	Returns:
		`true` if at least one code point of `str` occurs in `chars`. An empty
		`str` yields `false`.
*/
bool anyOf(const(char)[] str, const(char)[] chars)
@safe pure {
	size_t pos = 0;
	while (pos < str.length) {
		// decode advances `pos` past the current code point
		immutable dchar ch = decode!(Yes.useReplacementDchar)(str, pos);
		if (chars.canFind(ch))
			return true;
	}
	return false;
}


/// ASCII whitespace trimming (space and tab) at the start of the string
inout(char)[] stripLeftA(inout(char)[] s)
@safe pure nothrow {
	while (s.length > 0 && (s[0] == ' ' || s[0] == '\t'))
		s = s[1 .. $];
	return s;
}

/// ASCII whitespace trimming (space and tab) at the end of the string
inout(char)[] stripRightA(inout(char)[] s)
@safe pure nothrow {
	while (s.length > 0 && (s[$-1] == ' ' || s[$-1] == '\t'))
		s = s[0 .. $-1];
	return s;
}

/// ASCII whitespace trimming (space and tab) at both ends of the string
inout(char)[] stripA(inout(char)[] s)
@safe pure nothrow {
	return stripLeftA(stripRightA(s));
}

/**
	Finds the first occurrence of any of the characters in `chars`.

	`str` is decoded code point by code point, so that multi-byte characters
	are compared as a whole instead of byte by byte. Malformed sequences in
	`str` are treated as U+FFFD instead of raising an exception.

	Params:
		str = the string to search in
		chars = the set of characters to look for
	Returns:
		The index of the first code unit of the first matching character, or
		-1 if no character of `str` is contained in `chars`.
*/
sizediff_t indexOfAny(const(char)[] str, const(char)[] chars)
@safe pure {
	size_t next = 0;
	while (next < str.length) {
		// remember where the code point starts, decode moves `next` past it
		immutable size_t start = next;
		immutable dchar ch = decode!(Yes.useReplacementDchar)(str, next);
		if (chars.canFind(ch))
			return start;
	}
	return -1;
}
alias countUntilAny = indexOfAny;

/**
	Finds the closing bracket (works with any of '[', '$(LPAREN)', '<', '{').

	Only the bracket type found at `str[0]` is tracked, other bracket types
	within the string are treated like ordinary characters.

	Params:
		str = input string
		nested = whether to skip nested brackets
	Returns:
		The index of the closing bracket or -1 for unbalanced strings
		and strings that don't start with a bracket.
*/
sizediff_t matchBracket(const(char)[] str, bool nested = true)
@safe pure nothrow {
	if (str.length < 2) return -1;

	char open = str[0], close = void;
	switch (str[0]) {
		case '[': close = ']'; break;
		case '(': close = ')'; break;
		case '<': close = '>'; break;
		case '{': close = '}'; break;
		default: return -1;
	}

	// number of currently unclosed brackets, the leading one is already open
	size_t level = 1;
	foreach (i, char c; str[1 .. $]) {
		if (nested && c == open) ++level;
		else if (c == close) --level;
		if (level == 0) return i + 1; // + 1 compensates for the skipped first character
	}
	return -1;
}

@safe unittest
{
	static struct Test { string str; sizediff_t res; }
	enum tests = [
		Test("[foo]", 4), Test("<bar>", 4), Test("{baz}", 4),
		Test("[", -1), Test("[foo", -1), Test("ab[f]", -1),
		Test("[foo[bar]]", 9), Test("[foo{bar]]", 8),
	];
	foreach (test; tests)
		assert(matchBracket(test.str) == test.res);
	assert(matchBracket("[foo[bar]]", false) == 8);
	static assert(matchBracket("[foo]") == 4);
}

/**
	Same as std.string.format, just using an allocator.

	The output is written with `formattedWrite` into an `AllocAppender!string`
	that is constructed from `alloc`.

	Params:
		alloc = allocator handed to the `AllocAppender`
		fmt = format string
		args = values to format
	Returns:
		The data of the appender after formatting.
		NOTE(review): the result is presumably backed by memory of `alloc` -
		confirm against `AllocAppender` before keeping it beyond the
		allocator's lifetime.
*/
string formatAlloc(Allocator, ARGS...)(scope Allocator alloc, string fmt, ARGS args)
{
	auto app = AllocAppender!string(alloc);
	formattedWrite(() @trusted { return &app; } (), fmt, args);
	return () @trusted { return app.data; } ();
}

/**
	Special version of icmp() with optimization for ASCII characters

	Compares two strings while ignoring case. ASCII characters are lowered
	directly, all other characters are decoded (malformed sequences become
	U+FFFD) and lowered using `std.uni.toLower`.

	Params:
		a = left hand side of the comparison
		b = right hand side of the comparison
	Returns:
		-1 if `a` sorts before `b`, 1 if it sorts after `b` and 0 if both
		strings are equal when ignoring case.
*/
int icmp2(const(char)[] a, const(char)[] b)
@safe pure nothrow {
	size_t i = 0, j = 0;

	// fast skip equal prefix
	size_t min_len = min(a.length, b.length);
	while( i < min_len && a[i] == b[i] ) i++;

	// Don't stop half-way in a UTF-8 sequence: step back to the lead byte of
	// the sequence that the last equal byte belongs to. Stepping back by a
	// single byte is not enough for sequences of three or four bytes, where
	// the comparison would resume on a continuation byte and both sides
	// would decode to the same replacement character.
	if( i > 0 && (a[i-1] & 0x80) ){
		i--;
		while( i > 0 && (a[i] & 0xC0) == 0x80 ) i--;
	}
	j = i; // the skipped prefix is byte-wise identical in both strings

	// compare the differing character and the rest of the string
	while(i < a.length && j < b.length){
		uint ac = cast(uint)a[i];
		uint bc = cast(uint)b[j];
		if( !((ac | bc) & 0x80) ){
			// both are ASCII: lower case them without decoding
			i++;
			j++;
			if( ac >= 'A' && ac <= 'Z' ) ac += 'a' - 'A';
			if( bc >= 'A' && bc <= 'Z' ) bc += 'a' - 'A';
			if( ac < bc ) return -1;
			else if( ac > bc ) return 1;
		} else {
			// at least one side is non-ASCII: compare whole code points
			dchar acp = decode!(Yes.useReplacementDchar)(a, i);
			dchar bcp = decode!(Yes.useReplacementDchar)(b, j);
			if( acp != bcp ){
				acp = std.uni.toLower(acp);
				bcp = std.uni.toLower(bcp);
				if( acp < bcp ) return -1;
				else if( acp > bcp ) return 1;
			}
		}
	}

	// one string is a prefix of the other: the longer one sorts last
	if( i < a.length ) return 1;
	else if( j < b.length ) return -1;

	assert(i == a.length || j == b.length, "Strings equal but we didn't fully compare them!?");
	return 0;
}

@safe unittest
{
	// multi-byte characters must be matched as whole code points
	assert(anyOf("\u00E9", "\u00E9"));
	assert(!anyOf("\u00E9", "\u00C3\u00A9"));
	assert(indexOfAny("a\u00E9-b", "\u00E9") == 1);
	assert(indexOfAny("\u00E9-", "-") == 2);
	assert(indexOfAny("abc", "xyz") == -1);

	// characters that differ only in their last UTF-8 byte are not equal
	assert(icmp2("\u4E00", "\u4E01") < 0);
	assert(icmp2("\u4E01", "\u4E00") > 0);
	assert(icmp2("\u00C4", "\u00E4") == 0);
	assert(icmp2("ABC", "abc") == 0);
}