1 /**
2 Utility functions for string processing
3
4 Copyright: © 2012-2014 Sönke Ludwig
5 License: Subject to the terms of the MIT license, as written in the included LICENSE.txt file.
6 Authors: Sönke Ludwig
7 */
8 module vibe.utils..string;
9
10 public import std.string;
11
12 import vibe.utils.array;
13 import vibe.internal.utilallocator;
14
15 import std.algorithm;
16 import std.array;
17 import std.ascii;
18 import std.format;
19 import std.typecons : Yes;
20 import std.uni;
21 import std.utf;
22 import core.exception;
23
24
/**
	Converts a byte sequence that may contain invalid UTF-8 into a valid UTF-8 string.

	Sequences that decode successfully are re-encoded as they are. Whenever decoding
	fails at some position, the single byte at that position is taken as the code
	point with the same numeric value, encoded as such, and processing continues
	with the following byte.

	Params:
		str = The raw bytes to sanitize

	Returns:
		The sanitized UTF-8 string
*/
string sanitizeUTF8(in ubyte[] str)
@safe pure {
	import std.utf : UTFException, decode, encode;

	auto output = appender!string();
	output.reserve(str.length);

	for (size_t pos = 0; pos < str.length; ) {
		// read the fallback value up front, so that it is available if decoding fails
		dchar codepoint = str[pos];
		try codepoint = decode(cast(const(char[]))str, pos);
		catch (UTFException) {
			// keep the raw byte value and step over the offending byte
			pos++;
		}

		char[4] buf;
		immutable count = encode(buf, codepoint);
		output.put(buf[0 .. count]);
	}

	return output.data;
}
48
/**
	Removes a leading UTF-8 byte order mark (the bytes EF BB BF) from a string.

	This is useful for text that has been read from a file.

	Params:
		str = The string to process

	Returns:
		The slice of `str` following the byte order mark, or `str` unchanged
		if it does not start with one
*/
inout(char)[] stripUTF8Bom(inout(char)[] str)
@safe pure nothrow {
	immutable has_bom = str.length >= 3
		&& str[0] == 0xEF
		&& str[1] == 0xBB
		&& str[2] == 0xBF;
	return has_bom ? str[3 .. $] : str;
}
59
60
/**
	Tests whether `str` consists exclusively of characters listed in `chars`.

	Both strings are processed code point by code point.

	Params:
		str = The string to test
		chars = The set of permitted characters

	Returns:
		`false` as soon as a character of `str` is not found in `chars`,
		`true` otherwise (including for an empty `str`)
*/
bool allOf(const(char)[] str, const(char)[] chars)
@safe pure {
	foreach (dchar codepoint; str) {
		immutable permitted = chars.canFind(codepoint);
		if (!permitted) return false;
	}
	return true;
}
71
/**
	Checks if any character in 'str' is contained in 'chars'.

	Both strings are processed code point by code point, so that non-ASCII
	characters are matched as a whole instead of by their individual UTF-8
	code units.

	Params:
		str = The string to test
		chars = The set of characters to look for

	Returns:
		`true` as soon as a character of `str` is found in `chars`,
		`false` otherwise (including for an empty `str`)
*/
bool anyOf(const(char)[] str, const(char)[] chars)
@safe pure {
	// iterate as dchar: canFind decodes `chars`, so comparing raw code units
	// against it would mismatch for anything outside of the ASCII range
	foreach (dchar ch; str)
		if (chars.canFind(ch))
			return true;
	return false;
}
82
83
/**
	Removes leading ASCII blanks from a string.

	Only the space and the tab character are considered.

	Params:
		s = The string to trim

	Returns:
		The slice of `s` starting at the first character that is neither
		a space nor a tab
*/
inout(char)[] stripLeftA(inout(char)[] s)
@safe pure nothrow {
	size_t start = 0;
	while (start < s.length && (s[start] == ' ' || s[start] == '\t'))
		start++;
	return s[start .. $];
}
91
/**
	Removes trailing ASCII blanks from a string.

	Only the space and the tab character are considered.

	Params:
		s = The string to trim

	Returns:
		The slice of `s` ending after the last character that is neither
		a space nor a tab
*/
inout(char)[] stripRightA(inout(char)[] s)
@safe pure nothrow {
	size_t end = s.length;
	while (end > 0 && (s[end-1] == ' ' || s[end-1] == '\t'))
		end--;
	return s[0 .. end];
}
99
/**
	Removes leading and trailing ASCII blanks from a string.

	Only the space and the tab character are considered.

	Params:
		s = The string to trim

	Returns:
		The slice of `s` without any spaces or tabs at either end
*/
inout(char)[] stripA(inout(char)[] s)
@safe pure nothrow {
	// trim the right side first, then the left side of what remains
	size_t end = s.length;
	while (end > 0 && (s[end-1] == ' ' || s[end-1] == '\t'))
		end--;

	size_t start = 0;
	while (start < end && (s[start] == ' ' || s[start] == '\t'))
		start++;

	return s[start .. end];
}
105
/**
	Finds the first occurrence of any of the characters in `chars`.

	ASCII characters are compared directly. Multi-byte UTF-8 sequences in `str`
	are decoded and matched as whole code points, so that non-ASCII characters
	in `chars` are found correctly and their individual code units never
	produce a match on their own. Malformed sequences in `str` are treated as
	the Unicode replacement character instead of raising an error.

	Params:
		str = The string to search in
		chars = The set of characters to search for

	Returns:
		The index (in code units) at which the first matching character of
		`str` starts, or -1 if none of the characters was found
*/
sizediff_t indexOfAny(const(char)[] str, const(char)[] chars)
@safe pure {
	size_t pos = 0;
	while (pos < str.length) {
		immutable start = pos;
		dchar ch = str[pos];
		// ASCII code units stand for themselves, anything else needs decoding
		if (ch < 0x80) pos++;
		else ch = decode!(Yes.useReplacementDchar)(str, pos);

		if (chars.canFind(ch))
			return start;
	}
	return -1;
}
alias countUntilAny = indexOfAny;
115
/**
	Locates the bracket that closes the bracket at the start of a string.

	The string has to start with one of '[', '$(LPAREN)', '<' or '{'. Only
	brackets of that same kind are taken into account, any other characters
	are skipped.

	Params:
		str = Input string, starting with the opening bracket
		nested = If `true`, further opening brackets of the same kind increase
			the nesting level and need to be closed first. If `false`, the
			first closing bracket ends the match.

	Returns:
		The index of the matching closing bracket, or -1 if the string is
		shorter than two characters, does not start with a supported bracket,
		or is unbalanced.
*/
sizediff_t matchBracket(const(char)[] str, bool nested = true)
@safe pure nothrow {
	if (str.length < 2) return -1;

	immutable char opening = str[0];
	char closing;
	switch (opening) {
		default: return -1;
		case '[': closing = ']'; break;
		case '(': closing = ')'; break;
		case '<': closing = '>'; break;
		case '{': closing = '}'; break;
	}

	// the opening bracket at index 0 accounts for the first level
	size_t depth = 1;
	for (size_t pos = 1; pos < str.length; pos++) {
		immutable c = str[pos];
		if (nested && c == opening) depth++;
		else if (c == closing) depth--;

		if (depth == 0) return pos;
	}
	return -1;
}
147
@safe unittest
{
	// input string paired with the expected result in the default (nested) mode
	static struct Case { string input; sizediff_t expected; }
	enum cases = [
		// simple, balanced brackets
		Case("[foo]", 4), Case("<bar>", 4), Case("{baz}", 4),
		// unbalanced input or input that doesn't start with a bracket
		Case("[", -1), Case("[foo", -1), Case("ab[f]", -1),
		// nested brackets, brackets of a different kind are ignored
		Case("[foo[bar]]", 9), Case("[foo{bar]]", 8),
	];
	foreach (c; cases)
		assert(matchBracket(c.input) == c.expected);

	// without nesting, the first closing bracket ends the match
	assert(matchBracket("[foo[bar]]", false) == 8);

	// the function also has to work at compile time
	static assert(matchBracket("[foo]") == 4);
}
161
/**
	Formats a string like `std.string.format`, but writes the result through
	an `AllocAppender` that is constructed from the given allocator.

	Params:
		alloc = The allocator handed to the `AllocAppender`
		fmt = The format string
		args = The values to format according to `fmt`

	Returns:
		The contents of the appender after formatting
*/
string formatAlloc(ARGS...)(IAllocator alloc, string fmt, ARGS args)
{
	auto buffer = AllocAppender!string(alloc);
	// taking the address of the local appender has to be marked as trusted
	auto sink = () @trusted { return &buffer; } ();
	formattedWrite(sink, fmt, args);
	return () @trusted { return buffer.data; } ();
}
169
/**
	Special version of icmp() with optimization for ASCII characters.

	The common prefix of both strings is skipped using a plain byte wise
	comparison. The remainder is compared case insensitively, using a fast
	path for pairs of ASCII characters and decoding/lowercasing full code
	points for anything else. Malformed UTF-8 sequences are decoded as the
	Unicode replacement character.

	Params:
		a = The first string
		b = The second string

	Returns:
		-1 if `a` orders before `b`, 1 if `a` orders after `b` and 0 if both
		strings are equal when ignoring case
*/
int icmp2(const(char)[] a, const(char)[] b)
@safe pure nothrow {
	size_t i = 0, j = 0;

	// fast skip equal prefix
	size_t min_len = min(a.length, b.length);
	while( i < min_len && a[i] == b[i] ) i++;

	// Don't stop half-way in a UTF-8 sequence: rewind over any trailing
	// continuation bytes (10xxxxxx) and the lead byte in front of them, so that
	// decoding resumes at the start of a sequence. Stepping back by a single
	// byte is not sufficient for sequences of three or four bytes.
	while( i > 0 && (a[i-1] & 0xC0) == 0x80 ) i--;
	if( i > 0 && (a[i-1] & 0x80) ) i--;
	j = i;

	// compare the differing character and the rest of the string
	while(i < a.length && j < b.length){
		uint ac = cast(uint)a[i];
		uint bc = cast(uint)b[j];
		if( !((ac | bc) & 0x80) ){
			// both are ASCII: compare the lower case code units directly
			i++;
			j++;
			if( ac >= 'A' && ac <= 'Z' ) ac += 'a' - 'A';
			if( bc >= 'A' && bc <= 'Z' ) bc += 'a' - 'A';
			if( ac < bc ) return -1;
			else if( ac > bc ) return 1;
		} else {
			// at least one side is non-ASCII: compare full code points
			dchar acp = decode!(Yes.useReplacementDchar)(a, i);
			dchar bcp = decode!(Yes.useReplacementDchar)(b, j);
			if( acp != bcp ){
				acp = std.uni.toLower(acp);
				bcp = std.uni.toLower(bcp);
				if( acp < bcp ) return -1;
				else if( acp > bcp ) return 1;
			}
		}
	}

	// one string is a prefix of the other: the longer one orders last
	if( i < a.length ) return 1;
	else if( j < b.length ) return -1;

	assert(i == a.length || j == b.length, "Strings equal but we didn't fully compare them!?");
	return 0;
}