1 /**
2 	Utility functions for string processing
3 
4 	Copyright: © 2012-2014 Sönke Ludwig
5 	License: Subject to the terms of the MIT license, as written in the included LICENSE.txt file.
6 	Authors: Sönke Ludwig
7 */
8 module vibe.utils..string;
9 
10 public import std.string;
11 
12 import vibe.utils.array;
13 import vibe.internal.utilallocator;
14 
15 import std.algorithm;
16 import std.array;
17 import std.ascii;
18 import std.format;
19 import std.typecons : Yes;
20 import std.uni;
21 import std.utf;
22 import core.exception;
23 
24 
/**
	Converts an arbitrary byte sequence into a valid UTF-8 string.

	Sequences that decode successfully are re-encoded as they are. A byte at
	which decoding fails is instead treated as a code point with the same
	numeric value as that byte and encoded as such, so that the result stays
	as close to the input as possible.

	Params:
		str = the raw bytes to sanitize
	Returns:
		The sanitized string.
*/
string sanitizeUTF8(in ubyte[] str)
@safe pure {
	import std.utf;

	const(char)[] text = cast(const(char)[])str;
	auto output = appender!string();
	output.reserve(text.length);

	for (size_t pos = 0; pos < text.length; ) {
		// Fallback used when decoding fails: the value of the raw byte itself.
		dchar codepoint = str[pos];
		try {
			codepoint = std.utf.decode(text, pos);
		} catch (UTFException) {
			// Step over the byte that could not be decoded.
			pos++;
		}

		char[4] encoded;
		immutable count = std.utf.encode(encoded, codepoint);
		output.put(encoded[0 .. count]);
	}

	return output.data;
}
48 
/**
	Removes a leading UTF-8 byte order mark (the bytes EF BB BF), if present.

	This is useful for text that has been read from a file.

	Params:
		str = the text to inspect
	Returns:
		`str` without its first three bytes if they form the byte order mark,
		otherwise `str` unchanged.
*/
inout(char)[] stripUTF8Bom(inout(char)[] str)
@safe pure nothrow {
	immutable hasBom = str.length >= 3
		&& str[0] == 0xEF
		&& str[1] == 0xBB
		&& str[2] == 0xBF;
	return hasBom ? str[3 .. $] : str;
}
59 
60 
/**
	Tests whether every character of `str` also occurs in `chars`.

	`str` is examined one code point at a time.

	Params:
		str = the string whose characters are checked
		chars = the set of permitted characters
	Returns:
		`true` if no character of `str` is missing from `chars` (which
		includes the case of an empty `str`), `false` otherwise.
*/
bool allOf(const(char)[] str, const(char)[] chars)
@safe pure {
	bool accepted = true;
	foreach (dchar codepoint; str) {
		accepted = chars.canFind(codepoint);
		// The first character outside the set settles the result.
		if (!accepted) break;
	}
	return accepted;
}
71 
/**
	Tests whether at least one character of `str` occurs in `chars`.

	`str` is examined one code point at a time, so characters outside of the
	ASCII range are matched as whole characters and not as individual UTF-8
	code units. Bytes of `str` that do not form a decodable UTF-8 sequence
	are skipped and never match.

	Params:
		str = the string to search in
		chars = the set of characters to look for
	Returns:
		`true` if any character of `str` is contained in `chars`, `false`
		otherwise (including for an empty `str`).
*/
bool anyOf(const(char)[] str, const(char)[] chars)
@safe pure {
	import std.utf : UTFException, decode;

	size_t pos = 0;
	while (pos < str.length) {
		immutable start = pos;
		dchar ch;
		try {
			ch = decode(str, pos);
		} catch (UTFException) {
			// Not a decodable sequence: ignore this byte and carry on with
			// the next one instead of failing the whole search.
			pos = start + 1;
			continue;
		}
		if (chars.canFind(ch))
			return true;
	}
	return false;
}
82 
83 
/// Removes leading ASCII whitespace (space and tab only) from `s`.
inout(char)[] stripLeftA(inout(char)[] s)
@safe pure nothrow {
	foreach (idx, c; s) {
		// The first character that is neither space nor tab starts the result.
		if (c != ' ' && c != '\t')
			return s[idx .. $];
	}
	// Nothing but spaces and tabs (or empty input).
	return s[$ .. $];
}
91 
/// Removes trailing ASCII whitespace (space and tab only) from `s`.
inout(char)[] stripRightA(inout(char)[] s)
@safe pure nothrow {
	foreach_reverse (idx, c; s) {
		// The last character that is neither space nor tab ends the result.
		if (c != ' ' && c != '\t')
			return s[0 .. idx + 1];
	}
	// Nothing but spaces and tabs (or empty input).
	return s[0 .. 0];
}
99 
/// Removes ASCII whitespace (space and tab only) from both ends of `s`.
inout(char)[] stripA(inout(char)[] s)
@safe pure nothrow {
	auto withoutTail = stripRightA(s);
	return stripLeftA(withoutTail);
}
105 
/**
	Finds the first occurrence of any of the characters in `chars`.

	`str` is examined one code point at a time, so characters outside of the
	ASCII range are matched as whole characters and not as individual UTF-8
	code units. Bytes of `str` that do not form a decodable UTF-8 sequence
	are skipped and never match.

	Params:
		str = the string to search in
		chars = the set of characters to look for
	Returns:
		The index into `str` (counted in code units) at which the first
		matching character starts, or -1 if no character of `chars` occurs.
*/
sizediff_t indexOfAny(const(char)[] str, const(char)[] chars)
@safe pure {
	import std.utf : UTFException, decode;

	size_t pos = 0;
	while (pos < str.length) {
		immutable start = pos;
		dchar ch;
		try {
			ch = decode(str, pos);
		} catch (UTFException) {
			// Not a decodable sequence: ignore this byte and carry on with
			// the next one instead of failing the whole search.
			pos = start + 1;
			continue;
		}
		if (chars.canFind(ch))
			return start;
	}
	return -1;
}
alias countUntilAny = indexOfAny;
115 
/**
	Locates the bracket that closes the one at the start of `str`.

	The opening bracket may be any of '[', '$(LPAREN)', '<' or '{'. Only
	brackets of that same kind are taken into account while searching.

	Params:
		str = input string, expected to begin with an opening bracket
		nested = if `true`, further opening brackets of the same kind must be
			closed first; if `false`, the first closing bracket is the match
	Returns:
		The index of the matching closing bracket, or -1 if `str` is
		unbalanced or does not start with one of the supported brackets.
*/
sizediff_t matchBracket(const(char)[] str, bool nested = true)
@safe pure nothrow {
	// An opening and a closing bracket need at least two characters.
	if (str.length < 2) return -1;

	immutable char opener = str[0];
	char closer;
	switch (opener) {
		default: return -1;
		case '(': closer = ')'; break;
		case '[': closer = ']'; break;
		case '{': closer = '}'; break;
		case '<': closer = '>'; break;
	}

	// Number of brackets that are currently open, including the first one.
	size_t depth = 1;
	for (size_t pos = 1; pos < str.length; ++pos) {
		immutable char c = str[pos];
		if (c == closer) {
			if (--depth == 0) return pos;
		} else if (nested && c == opener) {
			++depth;
		}
	}
	return -1;
}
147 
@safe unittest
{
	// plain pairs of several bracket kinds
	assert(matchBracket("[foo]") == 4);
	assert(matchBracket("<bar>") == 4);
	assert(matchBracket("{baz}") == 4);

	// unbalanced input and input that does not start with a bracket
	assert(matchBracket("[") == -1);
	assert(matchBracket("[foo") == -1);
	assert(matchBracket("ab[f]") == -1);

	// only brackets of the same kind as the opening one take part in nesting
	assert(matchBracket("[foo[bar]]") == 9);
	assert(matchBracket("[foo{bar]]") == 8);

	// with nesting disabled the first closing bracket is the match
	assert(matchBracket("[foo[bar]]", false) == 8);

	// the function can be evaluated at compile time
	static assert(matchBracket("[foo]") == 4);
}
161 
/**
	Formats `args` according to `fmt` like `std.string.format`, but collects
	the output in an `AllocAppender` that is constructed with `alloc`.

	Params:
		alloc = the allocator handed to the appender
		fmt = the format string
		args = the values to format
	Returns:
		The data accumulated by the appender.
*/
string formatAlloc(ARGS...)(IAllocator alloc, string fmt, ARGS args)
{
	auto buffer = AllocAppender!string(alloc);
	// The appender is passed by pointer; taking the address of a local is
	// wrapped in a @trusted lambda, as is reading the appender's data below.
	auto sink = () @trusted { return &buffer; } ();
	formattedWrite(sink, fmt, args);
	return () @trusted { return buffer.data; } ();
}
169 
/**
	Case-insensitive string comparison with a fast path for ASCII characters.

	A common prefix of identical bytes is skipped first. The remainder is
	compared character by character: pairs of ASCII characters are lower-cased
	arithmetically, all other characters are decoded (invalid sequences become
	the replacement character) and lower-cased using `std.uni.toLower`.

	Params:
		a = left-hand side of the comparison
		b = right-hand side of the comparison
	Returns:
		-1 if `a` orders before `b`, 1 if `a` orders after `b` and 0 if both
		are equal when ignoring case.
*/
int icmp2(const(char)[] a, const(char)[] b)
@safe pure nothrow {
	size_t i = 0, j = 0;

	// fast skip equal prefix
	size_t min_len = min(a.length, b.length);
	while( i < min_len && a[i] == b[i] ) i++;

	// The byte-wise skip may have stopped half-way in a UTF-8 sequence. Go
	// back to the first byte of that sequence: stepping back a single byte is
	// not enough for sequences of three or four bytes that differ only in
	// their last byte, because decoding would then start on a continuation
	// byte and distinct characters would compare as equal.
	if( i > 0 && (a[i-1] & 0x80) ){
		i--;
		while( i > 0 && (a[i] & 0xC0) == 0x80 ) i--;
	}
	j = i; // a[0 .. i] == b[0 .. i], so both strings resume at the same offset

	// compare the differing character and the rest of the string
	while(i < a.length && j < b.length){
		uint ac = cast(uint)a[i];
		uint bc = cast(uint)b[j];
		if( !((ac | bc) & 0x80) ){
			// both are ASCII: no decoding necessary
			i++;
			j++;
			if( ac >= 'A' && ac <= 'Z' ) ac += 'a' - 'A';
			if( bc >= 'A' && bc <= 'Z' ) bc += 'a' - 'A';
			if( ac < bc ) return -1;
			else if( ac > bc ) return 1;
		} else {
			// decode() advances i and j past the decoded sequences
			dchar acp = decode!(Yes.useReplacementDchar)(a, i);
			dchar bcp = decode!(Yes.useReplacementDchar)(b, j);
			if( acp != bcp ){
				acp = std.uni.toLower(acp);
				bcp = std.uni.toLower(bcp);
				if( acp < bcp ) return -1;
				else if( acp > bcp ) return 1;
			}
		}
	}

	// one string is a (case-insensitive) prefix of the other: the longer one orders last
	if( i < a.length ) return 1;
	else if( j < b.length ) return -1;

	assert(i == a.length || j == b.length, "Strings equal but we didn't fully compare them!?");
	return 0;
}