Context Navigation

source: branches/rsr.v5.1.dev/web/punbb/include/utf8/utils/ascii.php @ 3

Last change on this file since 3 was 3, checked in by dj3c1t, 12 years ago
passage a Fluxbb 1.4.7
File size: 7.6 KB

Line
1	<?php
2
3	/**
4	* Tools to help with ASCII in UTF-8
5	* @version $Id: ascii.php,v 1.5 2006/10/16 20:38:12 harryf Exp $
6	* @package utf8
7	* @subpackage ascii
8	*/
9
/**
* Reports whether a string consists solely of 7-bit ASCII bytes
* (0x00-0x7F).
*
* Handy as a cheap pre-check: when a string is pure ASCII the native
* byte-oriented PHP functions are safe to use and usually faster than
* their UTF-8 aware counterparts, e.g.;
*
* <code>
* if ( utf8_is_ascii($someString) ) {
* // Pure ASCII - the native function is fine
* $someString = strtolower($someString);
* } else {
* $someString = utf8_strtolower($someString);
* }
* </code>
*
* Note that an empty string is reported as ASCII (TRUE).
*
* @param string $str string to inspect
* @return boolean TRUE if no byte above 0x7F was found
* @package utf8
* @subpackage ascii
* @see utf8_is_ascii_ctrl
*/
function utf8_is_ascii($str)
{
	// One byte outside 0x00-0x7F is enough to disqualify the string
	$high_byte_found = preg_match('/(?:[^\x00-\x7F])/', $str);

	return $high_byte_found !== 1;
}
36
/**
* Reports whether a string consists solely of "plain text" ASCII bytes:
* tab (0x09), line feed (0x0A), carriage return (0x0D) and the range
* 0x20-0x7E. Any other byte - the remaining device control codes, DEL
* (0x7F) or anything above 0x7F - makes the check fail. The device
* control codes can be found on the second table here:
* http://www.w3schools.com/tags/ref_ascii.asp
*
* Note that, unlike utf8_is_ascii, an empty string yields FALSE.
*
* @param string $str string to inspect
* @return boolean TRUE if non-empty and free of control / non-ASCII bytes
* @package utf8
* @subpackage ascii
* @see utf8_is_ascii
*/
function utf8_is_ascii_ctrl($str)
{
	// Nothing to inspect: an empty string is reported as FALSE
	if (strlen($str) <= 0)
		return false;

	// Look for the first byte that is not in the permitted set
	$forbidden_found = preg_match('/[^\x09\x0A\x0D\x20-\x7E]/', $str);

	return $forbidden_found !== 1;
}
56
/**
* Strip out all non-7bit ASCII bytes
* If you need to transmit a string to system which you know can only
* support 7bit ASCII, you could use this function.
*
* Every byte in the range 0x00-0x7F is kept in its original order; every
* byte above 0x7F (i.e. all bytes of multi-byte UTF-8 sequences) is
* removed.
*
* @param string $str input string
* @return string with non ASCII bytes removed ('' if the regex engine fails)
* @package utf8
* @subpackage ascii
* @see utf8_strip_non_ascii_ctrl
*/
function utf8_strip_non_ascii($str)
{
	// The previous implementation looped over preg_match() with the
	// alternation written as "\|", i.e. a literal pipe, so it never copied
	// anything and always returned ''. A single byte-wise replace (no /u
	// modifier on purpose) does the job directly.
	$result = preg_replace('/[^\x00-\x7F]+/', '', (string) $str);

	// preg_replace() yields NULL on a PCRE error
	return is_string($result) ? $result : '';
}
84
/**
* Strip out device control codes in the ASCII range
* which are not permitted in XML. Note that this leaves
* multi-byte characters untouched - it only removes device
* control codes
*
* Removed bytes: 0x00-0x08, 0x0B, 0x0C, 0x0E-0x1F and 0x7F. Tab (0x09),
* line feed (0x0A) and carriage return (0x0D) are kept.
*
* @see http://hsivonen.iki.fi/producing-xml/#controlchar
* @param string $str input string
* @return string control codes removed ('' if the regex engine fails)
* @package utf8
* @subpackage ascii
*/
function utf8_strip_ascii_ctrl($str)
{
	// The previous implementation looped over preg_match() with the
	// alternation written as "\|", i.e. a literal pipe, so it never copied
	// anything and always returned ''. The bytes to remove are all below
	// 0x80, so a byte-wise replace cannot split a UTF-8 sequence.
	$result = preg_replace('/[\x00-\x08\x0B\x0C\x0E-\x1F\x7F]+/', '', (string) $str);

	// preg_replace() yields NULL on a PCRE error
	return is_string($result) ? $result : '';
}
111
/**
* Strip out all non 7bit ASCII bytes and ASCII device control codes.
* For a list of ASCII device control codes see the 2nd table here:
* http://www.w3schools.com/tags/ref_ascii.asp
*
* Only tab (0x09), line feed (0x0A), carriage return (0x0D) and the
* range 0x20-0x7E survive; every other byte is removed.
*
* @param string $str input string
* @return string with control codes and non ASCII bytes removed
*                ('' if the regex engine fails)
* @package utf8
* @subpackage ascii
* @see utf8_strip_non_ascii
*/
function utf8_strip_non_ascii_ctrl($str)
{
	// The previous implementation looped over preg_match() with the
	// alternation written as "\|", i.e. a literal pipe, so it never copied
	// anything and always returned ''. A single byte-wise replace (no /u
	// modifier on purpose) does the job directly.
	$result = preg_replace('/[^\x09\x0A\x0D\x20-\x7E]+/', '', (string) $str);

	// preg_replace() yields NULL on a PCRE error
	return is_string($result) ? $result : '';
}
139
/**
* Replace accented UTF-8 characters by unaccented ASCII-7 "equivalents".
* The purpose of this function is to replace characters commonly found in Latin
* alphabets with something more or less equivalent from the ASCII range. This can
* be useful for converting a UTF-8 to something ready for a filename, for example.
* Following the use of this function, you would probably also pass the string
* through utf8_strip_non_ascii to clean out any other non-ASCII chars
* Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
* letters. Default is to deaccent both cases ($case = 0)
*
* For a more complete implementation of transliteration, see the utf8_to_ascii package
* available from the phputf8 project downloads:
* http://prdownloads.sourceforge.net/phputf8
*
* NOTE: the lookup tables below contain literal UTF-8 characters, so this file
* must be stored as UTF-8. If it is ever re-encoded the keys turn into garbage
* and no accented character will match any more.
*
* @param string $str UTF-8 string
* @param int $case (optional) -1 lowercase only, +1 uppercase only, 0 both cases
* @return string accented chars replaced with ascii equivalents
* @author Andreas Gohr <andi@splitbrain.org>
* @package utf8
* @subpackage ascii
*/
function utf8_accents_to_ascii($str, $case=0)
{
	// Tables are built once per request and reused on later calls
	static $UTF8_LOWER_ACCENTS = null;
	static $UTF8_UPPER_ACCENTS = null;

	if ($case <= 0)
	{
		if (is_null($UTF8_LOWER_ACCENTS))
		{
			$UTF8_LOWER_ACCENTS = array(
				'à' => 'a', 'ô' => 'o', 'ď' => 'd', 'ḟ' => 'f', 'ë' => 'e', 'š' => 's', 'ơ' => 'o',
				'ß' => 'ss', 'ă' => 'a', 'ř' => 'r', 'ț' => 't', 'ň' => 'n', 'ā' => 'a', 'ķ' => 'k',
				'ŝ' => 's', 'ỳ' => 'y', 'ņ' => 'n', 'ĺ' => 'l', 'ħ' => 'h', 'ṗ' => 'p', 'ó' => 'o',
				'ú' => 'u', 'ě' => 'e', 'é' => 'e', 'ç' => 'c', 'ẁ' => 'w', 'ċ' => 'c', 'õ' => 'o',
				'ṡ' => 's', 'ø' => 'o', 'ģ' => 'g', 'ŧ' => 't', 'ș' => 's', 'ė' => 'e', 'ĉ' => 'c',
				'ś' => 's', 'î' => 'i', 'ű' => 'u', 'ć' => 'c', 'ę' => 'e', 'ŵ' => 'w', 'ṫ' => 't',
				'ū' => 'u', 'č' => 'c', 'ö' => 'oe', 'è' => 'e', 'ŷ' => 'y', 'ą' => 'a', 'ł' => 'l',
				'ų' => 'u', 'ů' => 'u', 'ş' => 's', 'ğ' => 'g', 'ļ' => 'l', 'ƒ' => 'f', 'ž' => 'z',
				'ẃ' => 'w', 'ḃ' => 'b', 'å' => 'a', 'ì' => 'i', 'ï' => 'i', 'ḋ' => 'd', 'ť' => 't',
				'ŗ' => 'r', 'ä' => 'ae', 'í' => 'i', 'ŕ' => 'r', 'ê' => 'e', 'ü' => 'ue', 'ò' => 'o',
				'ē' => 'e', 'ñ' => 'n', 'ń' => 'n', 'ĥ' => 'h', 'ĝ' => 'g', 'đ' => 'd', 'ĵ' => 'j',
				'ÿ' => 'y', 'ũ' => 'u', 'ŭ' => 'u', 'ư' => 'u', 'ţ' => 't', 'ý' => 'y', 'ő' => 'o',
				'â' => 'a', 'ľ' => 'l', 'ẅ' => 'w', 'ż' => 'z', 'ī' => 'i', 'ã' => 'a', 'ġ' => 'g',
				'ṁ' => 'm', 'ō' => 'o', 'ĩ' => 'i', 'ù' => 'u', 'į' => 'i', 'ź' => 'z', 'á' => 'a',
				'û' => 'u', 'þ' => 'th', 'ð' => 'dh', 'æ' => 'ae', 'µ' => 'u', 'ĕ' => 'e',
			);
		}

		// Every key is one complete UTF-8 character, so a single-pass
		// translation gives the same result as sequential replacement
		$str = strtr($str, $UTF8_LOWER_ACCENTS);
	}

	if ($case >= 0)
	{
		if (is_null($UTF8_UPPER_ACCENTS))
		{
			$UTF8_UPPER_ACCENTS = array(
				'À' => 'A', 'Ô' => 'O', 'Ď' => 'D', 'Ḟ' => 'F', 'Ë' => 'E', 'Š' => 'S', 'Ơ' => 'O',
				'Ă' => 'A', 'Ř' => 'R', 'Ț' => 'T', 'Ň' => 'N', 'Ā' => 'A', 'Ķ' => 'K',
				'Ŝ' => 'S', 'Ỳ' => 'Y', 'Ņ' => 'N', 'Ĺ' => 'L', 'Ħ' => 'H', 'Ṗ' => 'P', 'Ó' => 'O',
				'Ú' => 'U', 'Ě' => 'E', 'É' => 'E', 'Ç' => 'C', 'Ẁ' => 'W', 'Ċ' => 'C', 'Õ' => 'O',
				'Ṡ' => 'S', 'Ø' => 'O', 'Ģ' => 'G', 'Ŧ' => 'T', 'Ș' => 'S', 'Ė' => 'E', 'Ĉ' => 'C',
				'Ś' => 'S', 'Î' => 'I', 'Ű' => 'U', 'Ć' => 'C', 'Ę' => 'E', 'Ŵ' => 'W', 'Ṫ' => 'T',
				'Ū' => 'U', 'Č' => 'C', 'Ö' => 'Oe', 'È' => 'E', 'Ŷ' => 'Y', 'Ą' => 'A', 'Ł' => 'L',
				'Ų' => 'U', 'Ů' => 'U', 'Ş' => 'S', 'Ğ' => 'G', 'Ļ' => 'L', 'Ƒ' => 'F', 'Ž' => 'Z',
				'Ẃ' => 'W', 'Ḃ' => 'B', 'Å' => 'A', 'Ì' => 'I', 'Ï' => 'I', 'Ḋ' => 'D', 'Ť' => 'T',
				'Ŗ' => 'R', 'Ä' => 'Ae', 'Í' => 'I', 'Ŕ' => 'R', 'Ê' => 'E', 'Ü' => 'Ue', 'Ò' => 'O',
				'Ē' => 'E', 'Ñ' => 'N', 'Ń' => 'N', 'Ĥ' => 'H', 'Ĝ' => 'G', 'Đ' => 'D', 'Ĵ' => 'J',
				'Ÿ' => 'Y', 'Ũ' => 'U', 'Ŭ' => 'U', 'Ư' => 'U', 'Ţ' => 'T', 'Ý' => 'Y', 'Ő' => 'O',
				'Â' => 'A', 'Ľ' => 'L', 'Ẅ' => 'W', 'Ż' => 'Z', 'Ī' => 'I', 'Ã' => 'A', 'Ġ' => 'G',
				'Ṁ' => 'M', 'Ō' => 'O', 'Ĩ' => 'I', 'Ù' => 'U', 'Į' => 'I', 'Ź' => 'Z', 'Á' => 'A',
				'Û' => 'U', 'Þ' => 'Th', 'Ð' => 'Dh', 'Æ' => 'Ae', 'Ĕ' => 'E',
			);
		}

		$str = strtr($str, $UTF8_UPPER_ACCENTS);
	}

	return $str;
}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: